-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF/BUG/API: factorizing categorical data #19938
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
18de376
9ef5be2
5e52b6f
e19ae86
121b682
a6bc405
0bfbc47
b25f383
2688c4f
2395a99
ab4f01c
65150f4
ad8173b
1e006d1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -435,15 +435,45 @@ def isin(comps, values): | |
return f(comps, values) | ||
|
||
|
||
def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): | ||
"""Factorize an array-like to labels and uniques. | ||
|
||
This doesn't do any coercion of types or unboxing before factorization. | ||
|
||
Parameters | ||
---------- | ||
values : ndarray | ||
check_nulls : bool | ||
Whether to check for nulls in the hashtable's 'get_labels' method. | ||
na_sentinel : int, default -1 | ||
size_hint : int, optional | ||
Passsed through to the hashtable's 'get_labels' method | ||
|
||
Returns | ||
------- | ||
labels, uniques : ndarray | ||
""" | ||
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) | ||
|
||
table = hash_klass(size_hint or len(values)) | ||
uniques = vec_klass() | ||
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) | ||
|
||
labels = _ensure_platform_int(labels) | ||
uniques = uniques.to_array() | ||
return labels, uniques | ||
|
||
|
||
@deprecate_kwarg(old_arg_name='order', new_arg_name=None) | ||
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | ||
""" | ||
Encode input values as an enumerated type or categorical variable | ||
|
||
Parameters | ||
---------- | ||
values : ndarray (1-d) | ||
Sequence | ||
values : Sequence | ||
ndarrays must be 1-D. Sequences that aren't pandas objects are | ||
coereced to ndarrays before factorization. | ||
sort : boolean, default False | ||
Sort by values | ||
na_sentinel : int, default -1 | ||
|
@@ -458,26 +488,43 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | |
Series | ||
|
||
note: an array of Periods will ignore sort as it returns an always sorted | ||
PeriodIndex | ||
PeriodIndex. | ||
""" | ||
# Implementation notes: This method is responsible for 3 things | ||
# 1.) coercing data to array-like (ndarray, Index, extension array) | ||
# 2.) factorizing labels and uniques | ||
# 3.) Maybe boxing the output in an Index | ||
# | ||
# Step 2 is dispatched to extension types (like Categorical). They are | ||
# responsible only for factorization. All data coercion, sorting and boxing | ||
# should happen here. | ||
|
||
values = _ensure_arraylike(values) | ||
original = values | ||
values, dtype, _ = _ensure_data(values) | ||
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) | ||
|
||
table = hash_klass(size_hint or len(values)) | ||
uniques = vec_klass() | ||
check_nulls = not is_integer_dtype(original) | ||
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) | ||
|
||
labels = _ensure_platform_int(labels) | ||
uniques = uniques.to_array() | ||
if is_categorical_dtype(values): | ||
values = getattr(values, '_values', values) | ||
labels, uniques = values.factorize() | ||
dtype = original.dtype | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn't this actually be a check on the values if they have a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This PR is just a bugfix for categorical. But the structure will be very similar (I'll just change I'll implement EA.factorize today hopefully, but have to get things like unique and argsort working first. |
||
values, dtype, _ = _ensure_data(values) | ||
check_nulls = not is_integer_dtype(original) | ||
labels, uniques = _factorize_array(values, check_nulls, | ||
na_sentinel=na_sentinel, | ||
size_hint=size_hint) | ||
|
||
if sort and len(uniques) > 0: | ||
from pandas.core.sorting import safe_sort | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could move to the except (but no big deal) |
||
uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, | ||
assume_unique=True) | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed all the sorting from Categorical.factorize. All that logic is here. I don't think we want to just call
On some small bencharks (10,000 elements) this is about 25-40% faster. The only slow case, for which we still need safe_sort, is when the array is mixed. In that case things are about 10% slower. |
||
order = uniques.argsort() | ||
order2 = order.argsort() | ||
labels = take_1d(order2, labels, fill_value=na_sentinel) | ||
uniques = uniques.take(order) | ||
except TypeError: | ||
# Mixed types, where uniques.argsort fails. | ||
uniques, labels = safe_sort(uniques, labels, | ||
na_sentinel=na_sentinel, | ||
assume_unique=True) | ||
|
||
uniques = _reconstruct_data(uniques, dtype, original) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
from pandas import compat | ||
from pandas.compat import u, lzip | ||
from pandas._libs import lib, algos as libalgos | ||
from pandas._libs.tslib import iNaT | ||
|
||
from pandas.core.dtypes.generic import ( | ||
ABCSeries, ABCIndexClass, ABCCategoricalIndex) | ||
|
@@ -364,10 +365,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, | |
self._dtype = self._dtype.update_dtype(dtype) | ||
self._codes = coerce_indexer_dtype(codes, dtype.categories) | ||
|
||
@classmethod | ||
def _constructor_from_sequence(cls, scalars): | ||
return cls(scalars) | ||
|
||
@property | ||
def categories(self): | ||
"""The categories of this categorical. | ||
|
@@ -425,6 +422,10 @@ def _ndarray_values(self): | |
def _constructor(self): | ||
return Categorical | ||
|
||
@classmethod | ||
def _constructor_from_sequence(cls, scalars): | ||
return Categorical(scalars) | ||
|
||
def copy(self): | ||
""" Copy constructor. """ | ||
return self._constructor(values=self._codes.copy(), | ||
|
@@ -2072,6 +2073,60 @@ def unique(self): | |
take_codes = sorted(take_codes) | ||
return cat.set_categories(cat.categories.take(take_codes)) | ||
|
||
def factorize(self, na_sentinel=-1): | ||
"""Encode the Categorical as an enumerated type. | ||
|
||
Parameters | ||
---------- | ||
sort : boolean, default False | ||
Sort by values | ||
na_sentinel: int, default -1 | ||
Value to mark "not found" | ||
|
||
Returns | ||
------- | ||
labels : ndarray | ||
An integer NumPy array that's an indexer into the original | ||
Categorical | ||
uniques : Categorical | ||
A Categorical whose values are the unique values and | ||
whose dtype matches the original CategoricalDtype. Note that if | ||
there any unobserved categories in ``self`` will not be present | ||
in ``uniques.values``. They will be present in | ||
``uniques.categories`` | ||
|
||
Examples | ||
-------- | ||
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) | ||
>>> labels, uniques = cat.factorize() | ||
>>> labels | ||
(array([0, 0, 1]), | ||
>>> uniques | ||
[a, c] | ||
Categories (3, object): [a, b, c]) | ||
|
||
Missing values are handled | ||
|
||
>>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None])) | ||
>>> labels | ||
array([ 0, 1, -1]) | ||
>>> uniques | ||
[a, b] | ||
Categories (2, object): [a, b] | ||
""" | ||
from pandas.core.algorithms import _factorize_array | ||
|
||
codes = self.codes.astype('int64') | ||
codes[codes == -1] = iNaT | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The interface we have to I think its worth re-factoring this (maybe before this PR), though I suppose could be after. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that'd be nicer. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we actually want this to be public? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. factorize in general? I don’t see why not. It’s present on series and index. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. #19938 (comment) was in reference to the API docs. We whitelist the methods on Categorical that are included in the API docs (just |
||
# We set missing codes, normally -1, to iNaT so that the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put the astype after the comment, looks awkward otherwise why do you think you need to do this? the point of the na_sentinel is to select the missing values There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. na_sentinel controls the missing marker for the output. We're modifying the input, since the Int64HashTable sees that they're missing, instead of the value -1. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should be fixed generally, e.g. we should have a way to pass in the missing value. can you fix There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not familiar with the hashtable code, but at a glance it looks like the null condition is included in the class definition template?
I'm not sure how to pass expressions down to cython as a parameter. Anyway, do we actually need this to be parameterized? Do we have other cases where we've needed to pass the null condition down? |
||
# Int64HashTable treats them as missing values. | ||
labels, uniques = _factorize_array(codes, check_nulls=True, | ||
na_sentinel=na_sentinel) | ||
uniques = self._constructor(self.categories.take(uniques), | ||
categories=self.categories, | ||
ordered=self.ordered) | ||
return labels, uniques | ||
|
||
def equals(self, other): | ||
""" | ||
Returns True if categorical arrays are equal. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import pytest | ||
import numpy as np | ||
|
||
import pandas as pd | ||
import pandas.util.testing as tm | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you should prob pull the categorical tests out of test_algos.py then There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For better or worse, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, yeah i guess just lots of tests for unique. |
||
@pytest.mark.parametrize('ordered', [True, False]) | ||
@pytest.mark.parametrize('categories', [ | ||
['b', 'a', 'c'], | ||
['a', 'b', 'c', 'd'], | ||
]) | ||
def test_factorize(categories, ordered): | ||
cat = pd.Categorical(['b', 'b', 'a', 'c', None], | ||
categories=categories, | ||
ordered=ordered) | ||
labels, uniques = pd.factorize(cat) | ||
expected_labels = np.array([0, 0, 1, 2, -1], dtype='int64') | ||
expected_uniques = pd.Categorical(['b', 'a', 'c'], | ||
categories=categories, | ||
ordered=ordered) | ||
|
||
tm.assert_numpy_array_equal(labels, expected_labels) | ||
tm.assert_categorical_equal(uniques, expected_uniques) | ||
|
||
|
||
def test_factorized_sort(): | ||
cat = pd.Categorical(['b', 'b', None, 'a']) | ||
labels, uniques = pd.factorize(cat, sort=True) | ||
expected_labels = np.array([1, 1, -1, 0], dtype='int64') | ||
expected_uniques = pd.Categorical(['a', 'b']) | ||
|
||
tm.assert_numpy_array_equal(labels, expected_labels) | ||
tm.assert_categorical_equal(uniques, expected_uniques) | ||
|
||
|
||
def test_factorized_sort_ordered(): | ||
cat = pd.Categorical(['b', 'b', None, 'a'], | ||
categories=['c', 'b', 'a'], | ||
ordered=True) | ||
|
||
labels, uniques = pd.factorize(cat, sort=True) | ||
expected_labels = np.array([0, 0, -1, 1], dtype='int64') | ||
expected_uniques = pd.Categorical(['b', 'a'], | ||
categories=['c', 'b', 'a'], | ||
ordered=True) | ||
|
||
tm.assert_numpy_array_equal(labels, expected_labels) | ||
tm.assert_categorical_equal(uniques, expected_uniques) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
see my comment below, but you might simpy dispatch on categricals and just return, mixing the impl is really confusing here.