|
19 | 19 | from pandas.core.base import NoNewAttributesMixin
|
20 | 20 | from pandas.util._decorators import Appender
|
21 | 21 | import re
|
| 22 | +import itertools |
22 | 23 | import pandas._libs.lib as lib
|
23 | 24 | import warnings
|
24 | 25 | import textwrap
|
@@ -837,23 +838,12 @@ def str_get_dummies(arr, sep='|'):
|
837 | 838 | --------
|
838 | 839 | pandas.get_dummies
|
839 | 840 | """
|
840 |
| - arr = arr.fillna('') |
841 |
| - try: |
842 |
| - arr = sep + arr + sep |
843 |
| - except TypeError: |
844 |
| - arr = sep + arr.astype(str) + sep |
845 |
| - |
846 |
| - tags = set() |
847 |
| - for ts in arr.str.split(sep): |
848 |
| - tags.update(ts) |
849 |
| - tags = sorted(tags - set([""])) |
850 | 841 |
|
851 |
| - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) |
852 |
| - |
853 |
| - for i, t in enumerate(tags): |
854 |
| - pat = sep + t + sep |
855 |
| - dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x) |
856 |
| - return dummies, tags |
| 842 | + arr = [list() if el is np.nan else str(el).split(sep) for el in arr] |
| 843 | + tags = sorted(set(itertools.chain.from_iterable(arr))) |
| 844 | + result = np.array([[t in x for t in tags] for x in arr]) |
| 845 | + import pdb; pdb.set_trace() |
| 846 | + return result, tags |
857 | 847 |
|
858 | 848 |
|
859 | 849 | def str_join(arr, sep):
|
@@ -1411,7 +1401,6 @@ def _wrap_result(self, result, use_codes=True,
|
1411 | 1401 | if expand is None:
|
1412 | 1402 | # infer from ndim if expand is not specified
|
1413 | 1403 | expand = False if result.ndim == 1 else True
|
1414 |
| - |
1415 | 1404 | elif expand is True and not isinstance(self._orig, Index):
|
1416 | 1405 | # required when expand=True is explicitly specified
|
1417 | 1406 | # not needed when inferred
|
@@ -1446,8 +1435,7 @@ def cons_row(x):
|
1446 | 1435 | return result
|
1447 | 1436 |
|
1448 | 1437 | if expand:
|
1449 |
| - result = list(result) |
1450 |
| - out = MultiIndex.from_tuples(result, names=name) |
| 1438 | + out = MultiIndex.from_arrays(np.transpose(result), names=name) |
1451 | 1439 | if out.nlevels == 1:
|
1452 | 1440 | # We had all tuples of length-one, which are
|
1453 | 1441 | # better represented as a regular Index.
|
@@ -1686,11 +1674,11 @@ def wrap(self, width, **kwargs):
|
1686 | 1674 | return self._wrap_result(result)
|
1687 | 1675 |
|
1688 | 1676 | @copy(str_get_dummies)
|
1689 |
| - def get_dummies(self, sep='|'): |
| 1677 | + def get_dummies(self, sep='|', dtype=None): |
1690 | 1678 | # we need to cast to Series of strings as only that has all
|
1691 | 1679 | # methods available for making the dummies...
|
1692 | 1680 | data = self._orig.astype(str) if self._is_categorical else self._data
|
1693 |
| - result, name = str_get_dummies(data, sep) |
| 1681 | + result, name = str_get_dummies(data, sep, dtype) |
1694 | 1682 | return self._wrap_result(result, use_codes=(not self._is_categorical),
|
1695 | 1683 | name=name, expand=True)
|
1696 | 1684 |
|
|
0 commit comments