From 1fee427908f67b8f191dd964c3cdd3071cd7a7e5 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 8 Aug 2020 11:53:13 -0500 Subject: [PATCH 1/6] ENH: Make explode work for sets --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/reshape.pyx | 7 +++++++ pandas/tests/frame/methods/test_explode.py | 7 +++++++ pandas/tests/series/methods/test_explode.py | 7 +++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 33e70daa55e66..e0041383b32cc 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -37,7 +37,7 @@ For example: Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) -- +- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 5c6c15fb50fed..e0efe9666da88 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -1,3 +1,5 @@ +from collections.abc import Set + import cython from cython import Py_ssize_t @@ -124,6 +126,9 @@ def explode(ndarray[object] values): counts = np.zeros(n, dtype='int64') for i in range(n): v = values[i] + if isinstance(v, Set): + v = list(v) + if c_is_list_like(v, False): if len(v): counts[i] += len(v) @@ -137,6 +142,8 @@ def explode(ndarray[object] values): count = 0 for i in range(n): v = values[i] + if isinstance(v, Set): + v = list(v) if c_is_list_like(v, False): if len(v): diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 2bbe8ac2d5b81..e9df8b1fc8234 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -172,3 +172,10 @@ def test_ignore_index(): {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] ) tm.assert_frame_equal(result, expected) + + +def test_explode_sets(): + df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1]) + result = df.explode(column="a").sort_values(by="a") + expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 4b65e042f7b02..13400827147f9 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -126,3 +126,10 @@ def test_ignore_index(): result = s.explode(ignore_index=True) expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object) tm.assert_series_equal(result, expected) + + +def test_explode_sets(): + s = pd.Series([{"a", "b", "c"}], index=[1]) + result = s.explode().sort_values() + expected = pd.Series(["a", "b", "c"], index=[1, 1, 1]) + tm.assert_series_equal(result, expected) From d206ad4a3aef5b21b2d714dfa0505adda794273f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 8 Aug 2020 15:17:58 -0500 Subject: [PATCH 2/6] Import abc --- pandas/_libs/reshape.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index e0efe9666da88..677c957439673 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -1,4 +1,4 @@ -from collections.abc import Set +from collections import abc import cython from cython import Py_ssize_t @@ -126,7 +126,7 @@ def explode(ndarray[object] values): counts = np.zeros(n, dtype='int64') for i in range(n): v = values[i] - if isinstance(v, Set): + if isinstance(v, abc.Set): v = list(v) if c_is_list_like(v, False): @@ -142,7 +142,7 @@ def explode(ndarray[object] values): count = 0 for i in range(n): v = values[i] - if isinstance(v, Set): + if isinstance(v, abc.Set): v = list(v) if c_is_list_like(v, False): From e4b3397a911fb4bf5d286ecd9bbcbd70959038d7 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 8 Aug 2020 19:00:23 -0500 Subject: [PATCH 3/6] Always cast --- pandas/_libs/reshape.pyx | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 677c957439673..82e179a4c8b6d 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -1,5 +1,3 @@ -from collections import abc - import cython from cython import Py_ssize_t @@ -126,11 +124,10 @@ def explode(ndarray[object] values): counts = np.zeros(n, dtype='int64') for i in range(n): v = values[i] - if isinstance(v, abc.Set): - v = list(v) - if c_is_list_like(v, False): + if c_is_list_like(v, True): if len(v): + v = list(v) counts[i] += len(v) else: # empty list-like, use a nan marker @@ -142,11 +139,10 @@ def explode(ndarray[object] values): count = 0 for i in range(n): v = values[i] - if isinstance(v, abc.Set): - v = list(v) - if c_is_list_like(v, False): + if c_is_list_like(v, True): if len(v): + v = list(v) for j in range(len(v)): result[count] = v[j] count += 1 From c2ebff4e8dfae815510b77974e4403497c74c6a9 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 8 Aug 2020 20:01:16 -0500 Subject: [PATCH 4/6] Don't cast --- pandas/_libs/reshape.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 82e179a4c8b6d..75dbb4b74aabd 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -127,7 +127,6 @@ def explode(ndarray[object] values): if c_is_list_like(v, True): if len(v): - v = list(v) counts[i] += len(v) else: # empty list-like, use a nan marker From 83dae317a162e6fd727d2d219e2fac505a9e4b1a Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 12 Aug 2020 19:13:51 -0500 Subject: [PATCH 5/6] Issue number --- pandas/tests/frame/methods/test_explode.py | 1 + pandas/tests/series/methods/test_explode.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index e9df8b1fc8234..bd0901387eeed 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -175,6 +175,7 @@ def test_ignore_index(): def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1]) result = df.explode(column="a").sort_values(by="a") expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 13400827147f9..1f0fbd1cc5ecb 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -129,6 +129,7 @@ def test_ignore_index(): def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 s = pd.Series([{"a", "b", "c"}], index=[1]) result = s.explode().sort_values() expected = pd.Series(["a", "b", "c"], index=[1, 1, 1]) From e6e513183304bb2ca88a29ef4d12d667f980b80a Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 22 Aug 2020 12:23:11 -0500 Subject: [PATCH 6/6] Update docs --- pandas/core/frame.py | 7 ++++--- pandas/core/series.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 547d86f221b5f..809786f3c6c58 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7074,10 +7074,11 @@ def explode( Notes ----- - This routine will explode list-likes including lists, tuples, + This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged. Empty list-likes will - result in a np.nan for that row. + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of rows in the + output will be non-deterministic when exploding sets. Examples -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index e8bf87a39b572..a19685f6b93d8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3823,10 +3823,11 @@ def explode(self, ignore_index: bool = False) -> "Series": Notes ----- - This routine will explode list-likes including lists, tuples, + This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged. Empty list-likes will - result in a np.nan for that row. + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of elements in + the output will be non-deterministic when exploding sets. Examples --------