diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index c08e22af295f4..e838afdbbd083 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -843,6 +843,8 @@ Categorical
 - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
 - Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`)
 - Bug in :class:`Series` constructor with scalar and ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19565`)
+- Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`)
+- Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`)
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 9101fca58d5fa..de2e638265f1e 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -435,6 +435,35 @@ def isin(comps, values):
     return f(comps, values)
 
 
+def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None):
+    """Factorize an array-like to labels and uniques.
+
+    This doesn't do any coercion of types or unboxing before factorization.
+
+    Parameters
+    ----------
+    values : ndarray
+    check_nulls : bool
+        Whether to check for nulls in the hashtable's 'get_labels' method.
+    na_sentinel : int, default -1
+    size_hint : int, optional
+        Passed through to the hashtable's 'get_labels' method
+
+    Returns
+    -------
+    labels, uniques : ndarray
+    """
+    (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
+
+    table = hash_klass(size_hint or len(values))
+    uniques = vec_klass()
+    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
+
+    labels = _ensure_platform_int(labels)
+    uniques = uniques.to_array()
+    return labels, uniques
+
+
 @deprecate_kwarg(old_arg_name='order', new_arg_name=None)
 def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     """
@@ -442,8 +471,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
 
     Parameters
     ----------
-    values : ndarray (1-d)
-        Sequence
+    values : Sequence
+        ndarrays must be 1-D. Sequences that aren't pandas objects are
+        coerced to ndarrays before factorization.
     sort : boolean, default False
         Sort by values
     na_sentinel : int, default -1
@@ -458,26 +488,43 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
         Series
 
     note: an array of Periods will ignore sort as it returns an always sorted
-    PeriodIndex
+    PeriodIndex.
     """
+    # Implementation notes: This method is responsible for 3 things
+    # 1.) coercing data to array-like (ndarray, Index, extension array)
+    # 2.) factorizing labels and uniques
+    # 3.) Maybe boxing the output in an Index
+    #
+    # Step 2 is dispatched to extension types (like Categorical). They are
+    # responsible only for factorization. All data coercion, sorting and boxing
+    # should happen here.
 
     values = _ensure_arraylike(values)
     original = values
-    values, dtype, _ = _ensure_data(values)
-    (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
-
-    table = hash_klass(size_hint or len(values))
-    uniques = vec_klass()
-    check_nulls = not is_integer_dtype(original)
-    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
 
-    labels = _ensure_platform_int(labels)
-    uniques = uniques.to_array()
+    if is_categorical_dtype(values):
+        values = getattr(values, '_values', values)
+        labels, uniques = values.factorize(na_sentinel=na_sentinel)
+        dtype = original.dtype
+    else:
+        values, dtype, _ = _ensure_data(values)
+        check_nulls = not is_integer_dtype(original)
+        labels, uniques = _factorize_array(values, check_nulls,
+                                           na_sentinel=na_sentinel,
+                                           size_hint=size_hint)
 
     if sort and len(uniques) > 0:
         from pandas.core.sorting import safe_sort
-        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
-                                    assume_unique=True)
+        try:
+            order = uniques.argsort()
+            order2 = order.argsort()
+            labels = take_1d(order2, labels, fill_value=na_sentinel)
+            uniques = uniques.take(order)
+        except TypeError:
+            # Mixed types, where uniques.argsort fails.
+            uniques, labels = safe_sort(uniques, labels,
+                                        na_sentinel=na_sentinel,
+                                        assume_unique=True)
 
     uniques = _reconstruct_data(uniques, dtype, original)
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index e23dc3b3e5b89..b37f88d8bfdce 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -7,6 +7,7 @@
 from pandas import compat
 from pandas.compat import u, lzip
 from pandas._libs import lib, algos as libalgos
+from pandas._libs.tslib import iNaT
 
 from pandas.core.dtypes.generic import (
     ABCSeries, ABCIndexClass, ABCCategoricalIndex)
@@ -364,10 +365,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
         self._dtype = self._dtype.update_dtype(dtype)
         self._codes = coerce_indexer_dtype(codes, dtype.categories)
 
-    @classmethod
-    def _constructor_from_sequence(cls, scalars):
-        return cls(scalars)
-
     @property
     def categories(self):
         """The categories of this categorical.
@@ -425,6 +422,10 @@ def _ndarray_values(self):
     def _constructor(self):
         return Categorical
 
+    @classmethod
+    def _constructor_from_sequence(cls, scalars):
+        return Categorical(scalars)
+
     def copy(self):
         """ Copy constructor. """
         return self._constructor(values=self._codes.copy(),
@@ -2072,6 +2073,58 @@ def unique(self):
             take_codes = sorted(take_codes)
         return cat.set_categories(cat.categories.take(take_codes))
 
+    def factorize(self, na_sentinel=-1):
+        """Encode the Categorical as an enumerated type.
+
+        Parameters
+        ----------
+        na_sentinel : int, default -1
+            Value to mark "not found"
+
+        Returns
+        -------
+        labels : ndarray
+            An integer NumPy array that's an indexer into the original
+            Categorical
+        uniques : Categorical
+            A Categorical whose values are the unique values and
+            whose dtype matches the original CategoricalDtype. Note that
+            any unobserved categories in ``self`` will not be present
+            in ``uniques.values``. They will be present in
+            ``uniques.categories``
+
+        Examples
+        --------
+        >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
+        >>> labels, uniques = cat.factorize()
+        >>> labels
+        array([0, 0, 1])
+        >>> uniques
+        [a, c]
+        Categories (3, object): [a, b, c]
+
+        Missing values are handled
+
+        >>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None]))
+        >>> labels
+        array([ 0,  1, -1])
+        >>> uniques
+        [a, b]
+        Categories (2, object): [a, b]
+        """
+        from pandas.core.algorithms import _factorize_array
+
+        codes = self.codes.astype('int64')
+        codes[codes == -1] = iNaT
+        # We set missing codes, normally -1, to iNaT so that the
+        # Int64HashTable treats them as missing values.
+        labels, uniques = _factorize_array(codes, check_nulls=True,
+                                           na_sentinel=na_sentinel)
+        uniques = self._constructor(self.categories.take(uniques),
+                                    categories=self.categories,
+                                    ordered=self.ordered)
+        return labels, uniques
+
     def equals(self, other):
         """
         Returns True if categorical arrays are equal.
diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py
new file mode 100644
index 0000000000000..61764ec0ff632
--- /dev/null
+++ b/pandas/tests/categorical/test_algos.py
@@ -0,0 +1,53 @@
+import pytest
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+@pytest.mark.parametrize('ordered', [True, False])
+@pytest.mark.parametrize('categories', [
+    ['b', 'a', 'c'],
+    ['a', 'b', 'c', 'd'],
+])
+def test_factorize(categories, ordered):
+    """Uniques keep the input dtype; missing values map to -1."""
+    cat = pd.Categorical(['b', 'b', 'a', 'c', None],
+                         categories=categories,
+                         ordered=ordered)
+    labels, uniques = pd.factorize(cat)
+    # factorize returns platform-int labels, so compare against np.intp.
+    expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp)
+    expected_uniques = pd.Categorical(['b', 'a', 'c'],
+                                      categories=categories,
+                                      ordered=ordered)
+
+    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_categorical_equal(uniques, expected_uniques)
+
+
+def test_factorized_sort():
+    """sort=True orders the uniques by their values."""
+    cat = pd.Categorical(['b', 'b', None, 'a'])
+    labels, uniques = pd.factorize(cat, sort=True)
+    expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
+    expected_uniques = pd.Categorical(['a', 'b'])
+
+    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_categorical_equal(uniques, expected_uniques)
+
+
+def test_factorized_sort_ordered():
+    """sort=True follows the category order, not the lexical order."""
+    cat = pd.Categorical(['b', 'b', None, 'a'],
+                         categories=['c', 'b', 'a'],
+                         ordered=True)
+
+    labels, uniques = pd.factorize(cat, sort=True)
+    expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
+    expected_uniques = pd.Categorical(['b', 'a'],
+                                      categories=['c', 'b', 'a'],
+                                      ordered=True)
+
+    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_categorical_equal(uniques, expected_uniques)