From d95a42bc52f9aeb23a36fecc5f63f50335fdee1c Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Thu, 6 Jun 2019 09:58:54 +0100 Subject: [PATCH 1/5] Defer Series.str.get_dummies to pandas.get_dummies --- pandas/core/strings.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index bd756491abd2f..7e81da4e959ab 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -20,6 +20,7 @@ from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin import pandas.core.common as com +from pandas.core.reshape.reshape import get_dummies _cpython_optimized_encoders = ( "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" @@ -1005,17 +1006,14 @@ def str_get_dummies(arr, sep='|'): except TypeError: arr = sep + arr.astype(str) + sep - tags = set() - for ts in arr.str.split(sep): - tags.update(ts) - tags = sorted(tags - {""}) + arr_split = arr.str.split(sep) + stacked = np.concatenate(arr_split) + stacked_idx = np.repeat(np.arange(len(arr)), arr_split.str.len()) - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) + dummies_stacked = get_dummies(stacked) + dummies = dummies_stacked.groupby(by=stacked_idx).sum() - for i, t in enumerate(tags): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x) - return dummies, tags + return dummies.values, dummies.columns.values def str_join(arr, sep): From 7ccaf479432932e9fae6428ce314cf0256b150eb Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Fri, 7 Jun 2019 09:26:44 +0100 Subject: [PATCH 2/5] Properly handle NaN --- pandas/core/strings.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7e81da4e959ab..b16c06118b66e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1001,13 +1001,11 @@ def str_get_dummies(arr, sep='|'): 2 1 0 1 """ arr = arr.fillna('') - try: - arr = sep + arr + sep - except TypeError: - arr = sep + arr.astype(str) + sep arr_split = arr.str.split(sep) - stacked = np.concatenate(arr_split) + + stacked = Series(np.concatenate(arr_split)) + stacked[stacked == ''] = np.nan stacked_idx = np.repeat(np.arange(len(arr)), arr_split.str.len()) dummies_stacked = get_dummies(stacked) From 11e168f2d350a17377cf009c277711e450d1dcb9 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Fri, 7 Jun 2019 09:27:21 +0100 Subject: [PATCH 3/5] Move imports locally to resolve circular import conflict --- pandas/core/strings.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b16c06118b66e..b5370786dddfa 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -20,7 +20,6 @@ from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin import pandas.core.common as com -from pandas.core.reshape.reshape import get_dummies _cpython_optimized_encoders = ( "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" @@ -1000,6 +999,9 @@ def str_get_dummies(arr, sep='|'): 1 0 0 0 2 1 0 1 """ + from pandas.core.reshape.reshape import get_dummies + from pandas import Series + arr = arr.fillna('') arr_split = arr.str.split(sep) From 70e077ab5332349fc375f8f3f3c3a98b2f80eb4f Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Fri, 7 Jun 2019 13:08:32 +0100 Subject: [PATCH 4/5] Deal with empty input data --- pandas/core/strings.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b5370786dddfa..56a20ff15a726 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1002,7 +1002,11 @@ def str_get_dummies(arr, sep='|'): from pandas.core.reshape.reshape import get_dummies from pandas import Series - arr = arr.fillna('') + if len(arr) == 0: + empty = np.empty(0, dtype='object') + return empty, empty + + arr = arr.fillna('').astype('str') arr_split = arr.str.split(sep) From a98aa26d44363d2d749898f002100eff7ff3fda3 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Sun, 16 Jun 2019 13:54:31 +0100 Subject: [PATCH 5/5] Deal with empty input data --- pandas/core/strings.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 56a20ff15a726..79d940182912a 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1002,15 +1002,11 @@ def str_get_dummies(arr, sep='|'): from pandas.core.reshape.reshape import get_dummies from pandas import Series - if len(arr) == 0: - empty = np.empty(0, dtype='object') - return empty, empty - arr = arr.fillna('').astype('str') arr_split = arr.str.split(sep) - stacked = Series(np.concatenate(arr_split)) + stacked = Series(np.concatenate(arr_split)) if len(arr) > 0 else Series() stacked[stacked == ''] = np.nan stacked_idx = np.repeat(np.arange(len(arr)), arr_split.str.len())