diff --git a/pandas/core/strings.py b/pandas/core/strings.py index bd756491abd2f..79d940182912a 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -999,23 +999,21 @@ def str_get_dummies(arr, sep='|'): 1 0 0 0 2 1 0 1 """ - arr = arr.fillna('') - try: - arr = sep + arr + sep - except TypeError: - arr = sep + arr.astype(str) + sep + from pandas.core.reshape.reshape import get_dummies + from pandas import Series + + arr = arr.fillna('').astype('str') + + arr_split = arr.str.split(sep) - tags = set() - for ts in arr.str.split(sep): - tags.update(ts) - tags = sorted(tags - {""}) + stacked = Series(np.concatenate(arr_split)) if len(arr) > 0 else Series() + stacked[stacked == ''] = np.nan + stacked_idx = np.repeat(np.arange(len(arr)), arr_split.str.len()) - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) + dummies_stacked = get_dummies(stacked) + dummies = dummies_stacked.groupby(by=stacked_idx).sum() - for i, t in enumerate(tags): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x) - return dummies, tags + return dummies.values, dummies.columns.values def str_join(arr, sep):