From a08e9403b796743ecd00980e57f85cfbb051166f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 23 Nov 2017 21:46:57 -0500 Subject: [PATCH 1/2] Propagating NaN values when using str.split (#18450) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/strings.py | 4 ++++ pandas/tests/test_strings.py | 9 +++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 637ccf0603e0f..a0d19fa25f188 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -141,6 +141,6 @@ Categorical Other ^^^^^ -- +- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) - - diff --git a/pandas/core/strings.py b/pandas/core/strings.py index abef6f6086dbd..9614641aa1abf 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1423,6 +1423,10 @@ def cons_row(x): return [x] result = [cons_row(x) for x in result] + if result: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [x * max_len if x[0] is np.nan else x for x in result] if not isinstance(expand, bool): raise ValueError("expand must be True or False") diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f1b97081b6d93..512579cdc8ab9 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2086,6 +2086,15 @@ def test_rsplit_to_multiindex_expand(self): tm.assert_index_equal(result, exp) assert result.nlevels == 2 + def test_split_nan_expand(self): + s = Series(["foo,bar,baz", NA]) + result = s.str.split(",", expand=True) + exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]]) + tm.assert_frame_equal(result, exp) + + # extra nan check - see GH 18463 + assert all(np.isnan(x) for x in result.iloc[1]) + def test_split_with_name(self): # GH 12617 From 5c644e86969745cd5c43df76316962c260d0e316 Mon Sep 17 00:00:00 2001 From: Jeff Reback 
Date: Sat, 25 Nov 2017 16:47:18 -0500 Subject: [PATCH 2/2] small doc edits --- doc/source/whatsnew/v0.21.1.txt | 6 +++++- pandas/tests/test_strings.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index a0d19fa25f188..576b22fb990b1 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -138,9 +138,13 @@ Categorical - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) - Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`) +String +^^^^^^ + +- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) + Other ^^^^^ -- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) - - diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 512579cdc8ab9..8aa69bcbfdf7f 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2087,12 +2087,15 @@ def test_rsplit_to_multiindex_expand(self): assert result.nlevels == 2 def test_split_nan_expand(self): + # gh-18450 s = Series(["foo,bar,baz", NA]) result = s.str.split(",", expand=True) exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]]) tm.assert_frame_equal(result, exp) - # extra nan check - see GH 18463 + # check that these are actually np.nan and not None + # TODO see GH 18463 + # tm.assert_frame_equal does not differentiate assert all(np.isnan(x) for x in result.iloc[1]) def test_split_with_name(self):