Skip to content

Commit f3532a4

Browse files
topper-123 authored and Pingviinituutti committed
API: Add dtype parameter to Categorical.from_codes (pandas-dev#24398)
* Add dtype to Categorical.from_codes
1 parent 4ed3de0 commit f3532a4

File tree

5 files changed: +124 -71 lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ Other Enhancements
403403
- :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
404404
all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
405405
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
406+
- :meth:`Categorical.from_codes` can now take a ``dtype`` parameter as an alternative to passing ``categories`` and ``ordered`` (:issue:`24398`).
406407
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
407408
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
408409
- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)

pandas/core/arrays/categorical.py

+39-28
Original file line numberDiff line numberDiff line change
@@ -603,13 +603,13 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
603603
return cls(codes, dtype=dtype, fastpath=True)
604604

605605
@classmethod
606-
def from_codes(cls, codes, categories, ordered=False):
606+
def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
607607
"""
608-
Make a Categorical type from codes and categories arrays.
608+
Make a Categorical type from codes and categories or dtype.
609609
610-
This constructor is useful if you already have codes and categories and
611-
so do not need the (computation intensive) factorization step, which is
612-
usually done on the constructor.
610+
This constructor is useful if you already have codes and
611+
categories/dtype and so do not need the (computation intensive)
612+
factorization step, which is usually done on the constructor.
613613
614614
If your data does not follow this convention, please use the normal
615615
constructor.
@@ -618,16 +618,38 @@ def from_codes(cls, codes, categories, ordered=False):
618618
----------
619619
codes : array-like, integers
620620
An integer array, where each integer points to a category in
621-
categories or -1 for NaN
622-
categories : index-like
621+
categories or dtype.categories, or else is -1 for NaN
622+
categories : index-like, optional
623623
The categories for the categorical. Items need to be unique.
624-
ordered : boolean, (default False)
625-
Whether or not this categorical is treated as a ordered
626-
categorical. If not given, the resulting categorical will be
627-
unordered.
628-
"""
629-
dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
630-
ordered)
624+
If the categories are not given here, then they must be provided
625+
in `dtype`.
626+
ordered : bool, optional
627+
Whether or not this categorical is treated as an ordered
628+
categorical. If not given here or in `dtype`, the resulting
629+
categorical will be unordered.
630+
dtype : CategoricalDtype or the string "category", optional
631+
If :class:`CategoricalDtype`, cannot be used together with
632+
`categories` or `ordered`.
633+
634+
.. versionadded:: 0.24.0
635+
636+
When `dtype` is provided, neither `categories` nor `ordered`
637+
should be provided.
638+
639+
Examples
640+
--------
641+
>>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
642+
>>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
643+
[a, b, a, b]
644+
Categories (2, object): [a < b]
645+
"""
646+
dtype = CategoricalDtype._from_values_or_dtype(categories=categories,
647+
ordered=ordered,
648+
dtype=dtype)
649+
if dtype.categories is None:
650+
msg = ("The categories must be provided in 'categories' or "
651+
"'dtype'. Both were None.")
652+
raise ValueError(msg)
631653

632654
codes = np.asarray(codes) # #21767
633655
if not is_integer_dtype(codes):
@@ -642,12 +664,6 @@ def from_codes(cls, codes, categories, ordered=False):
642664
if msg:
643665
raise ValueError(msg)
644666

645-
try:
646-
codes = coerce_indexer_dtype(codes, categories)
647-
except (ValueError, TypeError):
648-
raise ValueError(
649-
"codes need to be convertible to an arrays of integers")
650-
651667
if len(codes) and (
652668
codes.max() >= len(dtype.categories) or codes.min() < -1):
653669
raise ValueError("codes need to be between -1 and "
@@ -1265,8 +1281,7 @@ def shift(self, periods, fill_value=None):
12651281
else:
12661282
codes[periods:] = fill_value
12671283

1268-
return self.from_codes(codes, categories=self.categories,
1269-
ordered=self.ordered)
1284+
return self.from_codes(codes, dtype=self.dtype)
12701285

12711286
def __array__(self, dtype=None):
12721287
"""
@@ -1887,9 +1902,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
18871902

18881903
codes = take(self._codes, indexer, allow_fill=allow_fill,
18891904
fill_value=fill_value)
1890-
result = type(self).from_codes(codes,
1891-
categories=dtype.categories,
1892-
ordered=dtype.ordered)
1905+
result = type(self).from_codes(codes, dtype=dtype)
18931906
return result
18941907

18951908
take = take_nd
@@ -2078,9 +2091,7 @@ def __setitem__(self, key, value):
20782091
new_codes = _recode_for_categories(
20792092
value.codes, value.categories, self.categories
20802093
)
2081-
value = Categorical.from_codes(new_codes,
2082-
categories=self.categories,
2083-
ordered=self.ordered)
2094+
value = Categorical.from_codes(new_codes, dtype=self.dtype)
20842095

20852096
rvalue = value if is_list_like(value) else [value]
20862097

pandas/core/indexes/category.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
148148
dtype = self.dtype
149149
if name is None:
150150
name = self.name
151-
cat = Categorical.from_codes(codes, categories=dtype.categories,
152-
ordered=dtype.ordered)
151+
cat = Categorical.from_codes(codes, dtype=dtype)
153152
return CategoricalIndex(cat, name=name)
154153

155154
@classmethod

pandas/tests/arrays/categorical/test_constructors.py

+82-40
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,9 @@ def test_constructor_unsortable(self):
7777
assert not factor.ordered
7878

7979
# this however will raise as cannot be sorted
80-
with pytest.raises(TypeError):
80+
msg = ("'values' is not ordered, please explicitly specify the "
81+
"categories order by passing in a categories argument.")
82+
with pytest.raises(TypeError, match=msg):
8183
Categorical(arr, ordered=True)
8284

8385
def test_constructor_interval(self):
@@ -99,10 +101,11 @@ def test_constructor(self):
99101
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
100102

101103
# categories must be unique
102-
with pytest.raises(ValueError):
104+
msg = "Categorical categories must be unique"
105+
with pytest.raises(ValueError, match=msg):
103106
Categorical([1, 2], [1, 2, 2])
104107

105-
with pytest.raises(ValueError):
108+
with pytest.raises(ValueError, match=msg):
106109
Categorical(["a", "b"], ["a", "b", "b"])
107110

108111
# The default should be unordered
@@ -211,21 +214,23 @@ def test_constructor(self):
211214

212215
def test_constructor_not_sequence(self):
213216
# https://github.com/pandas-dev/pandas/issues/16022
214-
with pytest.raises(TypeError):
217+
msg = r"^Parameter 'categories' must be list-like, was"
218+
with pytest.raises(TypeError, match=msg):
215219
Categorical(['a', 'b'], categories='a')
216220

217221
def test_constructor_with_null(self):
218222

219223
# Cannot have NaN in categories
220-
with pytest.raises(ValueError):
224+
msg = "Categorial categories cannot be null"
225+
with pytest.raises(ValueError, match=msg):
221226
Categorical([np.nan, "a", "b", "c"],
222227
categories=[np.nan, "a", "b", "c"])
223228

224-
with pytest.raises(ValueError):
229+
with pytest.raises(ValueError, match=msg):
225230
Categorical([None, "a", "b", "c"],
226231
categories=[None, "a", "b", "c"])
227232

228-
with pytest.raises(ValueError):
233+
with pytest.raises(ValueError, match=msg):
229234
Categorical(DatetimeIndex(['nat', '20160101']),
230235
categories=[NaT, Timestamp('20160101')])
231236

@@ -347,13 +352,14 @@ def test_constructor_with_dtype(self, ordered):
347352

348353
def test_constructor_dtype_and_others_raises(self):
349354
dtype = CategoricalDtype(['a', 'b'], ordered=True)
350-
with pytest.raises(ValueError, match="Cannot"):
355+
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
356+
with pytest.raises(ValueError, match=msg):
351357
Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)
352358

353-
with pytest.raises(ValueError, match="Cannot"):
359+
with pytest.raises(ValueError, match=msg):
354360
Categorical(['a', 'b'], ordered=True, dtype=dtype)
355361

356-
with pytest.raises(ValueError, match="Cannot"):
362+
with pytest.raises(ValueError, match=msg):
357363
Categorical(['a', 'b'], ordered=False, dtype=dtype)
358364

359365
@pytest.mark.parametrize('categories', [
@@ -417,33 +423,44 @@ def test_constructor_with_categorical_categories(self):
417423
def test_from_codes(self):
418424

419425
# too few categories
420-
with pytest.raises(ValueError):
421-
Categorical.from_codes([1, 2], [1, 2])
426+
dtype = CategoricalDtype(categories=[1, 2])
427+
msg = "codes need to be between "
428+
with pytest.raises(ValueError, match=msg):
429+
Categorical.from_codes([1, 2], categories=dtype.categories)
430+
with pytest.raises(ValueError, match=msg):
431+
Categorical.from_codes([1, 2], dtype=dtype)
422432

423433
# no int codes
424-
with pytest.raises(ValueError):
425-
Categorical.from_codes(["a"], [1, 2])
434+
msg = "codes need to be array-like integers"
435+
with pytest.raises(ValueError, match=msg):
436+
Categorical.from_codes(["a"], categories=dtype.categories)
437+
with pytest.raises(ValueError, match=msg):
438+
Categorical.from_codes(["a"], dtype=dtype)
426439

427440
# no unique categories
428-
with pytest.raises(ValueError):
429-
Categorical.from_codes([0, 1, 2], ["a", "a", "b"])
441+
with pytest.raises(ValueError,
442+
match="Categorical categories must be unique"):
443+
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
430444

431445
# NaN categories included
432-
with pytest.raises(ValueError):
433-
Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
446+
with pytest.raises(ValueError,
447+
match="Categorial categories cannot be null"):
448+
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
434449

435450
# too negative
436-
with pytest.raises(ValueError):
437-
Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
451+
dtype = CategoricalDtype(categories=["a", "b", "c"])
452+
msg = r"codes need to be between -1 and len\(categories\)-1"
453+
with pytest.raises(ValueError, match=msg):
454+
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
455+
with pytest.raises(ValueError, match=msg):
456+
Categorical.from_codes([-2, 1, 2], dtype=dtype)
438457

439458
exp = Categorical(["a", "b", "c"], ordered=False)
440-
res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"])
459+
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
441460
tm.assert_categorical_equal(exp, res)
442461

443-
# Not available in earlier numpy versions
444-
if hasattr(np.random, "choice"):
445-
codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
446-
Categorical.from_codes(codes, categories=["train", "test"])
462+
res = Categorical.from_codes([0, 1, 2], dtype=dtype)
463+
tm.assert_categorical_equal(exp, res)
447464

448465
def test_from_codes_with_categorical_categories(self):
449466
# GH17884
@@ -458,28 +475,56 @@ def test_from_codes_with_categorical_categories(self):
458475
tm.assert_categorical_equal(result, expected)
459476

460477
# non-unique Categorical still raises
461-
with pytest.raises(ValueError):
478+
with pytest.raises(ValueError,
479+
match="Categorical categories must be unique"):
462480
Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))
463481

464482
def test_from_codes_with_nan_code(self):
465483
# GH21767
466484
codes = [1, 2, np.nan]
467-
categories = ['a', 'b', 'c']
468-
with pytest.raises(ValueError):
469-
Categorical.from_codes(codes, categories)
485+
dtype = CategoricalDtype(categories=['a', 'b', 'c'])
486+
with pytest.raises(ValueError,
487+
match="codes need to be array-like integers"):
488+
Categorical.from_codes(codes, categories=dtype.categories)
489+
with pytest.raises(ValueError,
490+
match="codes need to be array-like integers"):
491+
Categorical.from_codes(codes, dtype=dtype)
470492

471493
def test_from_codes_with_float(self):
472494
# GH21767
473495
codes = [1.0, 2.0, 0] # integer, but in float dtype
474-
categories = ['a', 'b', 'c']
496+
dtype = CategoricalDtype(categories=['a', 'b', 'c'])
497+
498+
with tm.assert_produces_warning(FutureWarning):
499+
cat = Categorical.from_codes(codes, dtype.categories)
500+
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
475501

476502
with tm.assert_produces_warning(FutureWarning):
477-
cat = Categorical.from_codes(codes, categories)
503+
cat = Categorical.from_codes(codes, dtype=dtype)
478504
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))
479505

480506
codes = [1.1, 2.0, 0] # non-integer
481-
with pytest.raises(ValueError):
482-
Categorical.from_codes(codes, categories)
507+
with pytest.raises(ValueError,
508+
match="codes need to be array-like integers"):
509+
Categorical.from_codes(codes, dtype.categories)
510+
with pytest.raises(ValueError,
511+
match="codes need to be array-like integers"):
512+
Categorical.from_codes(codes, dtype=dtype)
513+
514+
def test_from_codes_with_dtype_raises(self):
515+
msg = 'Cannot specify'
516+
with pytest.raises(ValueError, match=msg):
517+
Categorical.from_codes([0, 1], categories=['a', 'b'],
518+
dtype=CategoricalDtype(['a', 'b']))
519+
520+
with pytest.raises(ValueError, match=msg):
521+
Categorical.from_codes([0, 1], ordered=True,
522+
dtype=CategoricalDtype(['a', 'b']))
523+
524+
def test_from_codes_neither(self):
525+
msg = "Both were None"
526+
with pytest.raises(ValueError, match=msg):
527+
Categorical.from_codes([0, 1])
483528

484529
@pytest.mark.parametrize('dtype', [None, 'category'])
485530
def test_from_inferred_categories(self, dtype):
@@ -515,14 +560,11 @@ def test_from_inferred_categories_coerces(self):
515560
expected = Categorical([1, 1, 2, np.nan])
516561
tm.assert_categorical_equal(result, expected)
517562

518-
def test_construction_with_ordered(self):
563+
@pytest.mark.parametrize('ordered', [None, True, False])
564+
def test_construction_with_ordered(self, ordered):
519565
# GH 9347, 9190
520-
cat = Categorical([0, 1, 2])
521-
assert not cat.ordered
522-
cat = Categorical([0, 1, 2], ordered=False)
523-
assert not cat.ordered
524-
cat = Categorical([0, 1, 2], ordered=True)
525-
assert cat.ordered
566+
cat = Categorical([0, 1, 2], ordered=ordered)
567+
assert cat.ordered == bool(ordered)
526568

527569
@pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
528570
def test_constructor_imaginary(self):

pandas/tests/indexes/test_category.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self):
158158
tm.assert_index_equal(result, expected, exact=True)
159159

160160
# error when combining categories/ordered and dtype kwargs
161-
msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
161+
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
162162
with pytest.raises(ValueError, match=msg):
163163
CategoricalIndex(data, categories=cats, dtype=dtype)
164164

0 commit comments

Comments
 (0)