Skip to content

Commit db48799

Browse files
authored
BUG: repr of Categorical does not distinguish int and str. (#34222)
1 parent 314ac9a commit db48799

File tree

15 files changed

+138
-114
lines changed

15 files changed

+138
-114
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,7 @@ Categorical
851851
- Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`)
852852
- Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`)
853853
- :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`)
854+
- Bug where the repr of :class:`Categorical` did not distinguish between ``int`` and ``str`` values (:issue:`33676`)
854855

855856
Datetimelike
856857
^^^^^^^^^^^^

pandas/core/algorithms.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -604,8 +604,8 @@ def factorize(
604604
>>> codes
605605
array([0, 0, 1]...)
606606
>>> uniques
607-
[a, c]
608-
Categories (3, object): [a, b, c]
607+
['a', 'c']
608+
Categories (3, object): ['a', 'b', 'c']
609609
610610
Notice that ``'b'`` is in ``uniques.categories``, despite not being
611611
present in ``cat.values``.

pandas/core/arrays/base.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -846,14 +846,14 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"
846846
--------
847847
>>> cat = pd.Categorical(['a', 'b', 'c'])
848848
>>> cat
849-
[a, b, c]
850-
Categories (3, object): [a, b, c]
849+
['a', 'b', 'c']
850+
Categories (3, object): ['a', 'b', 'c']
851851
>>> cat.repeat(2)
852-
[a, a, b, b, c, c]
853-
Categories (3, object): [a, b, c]
852+
['a', 'a', 'b', 'b', 'c', 'c']
853+
Categories (3, object): ['a', 'b', 'c']
854854
>>> cat.repeat([1, 2, 3])
855-
[a, b, b, c, c, c]
856-
Categories (3, object): [a, b, c]
855+
['a', 'b', 'b', 'c', 'c', 'c']
856+
Categories (3, object): ['a', 'b', 'c']
857857
"""
858858

859859
@Substitution(klass="ExtensionArray")

pandas/core/arrays/categorical.py

+55-50
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from csv import QUOTE_NONNUMERIC
2+
from functools import partial
13
import operator
24
from shutil import get_terminal_size
35
from typing import Dict, Hashable, List, Type, Union, cast
@@ -275,17 +277,17 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject):
275277
Categories (3, int64): [1, 2, 3]
276278
277279
>>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
278-
[a, b, c, a, b, c]
279-
Categories (3, object): [a, b, c]
280+
['a', 'b', 'c', 'a', 'b', 'c']
281+
Categories (3, object): ['a', 'b', 'c']
280282
281283
Ordered `Categoricals` can be sorted according to the custom order
282284
of the categories and can have a min and max value.
283285
284286
>>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
285287
... categories=['c', 'b', 'a'])
286288
>>> c
287-
[a, b, c, a, b, c]
288-
Categories (3, object): [c < b < a]
289+
['a', 'b', 'c', 'a', 'b', 'c']
290+
Categories (3, object): ['c' < 'b' < 'a']
289291
>>> c.min()
290292
'c'
291293
"""
@@ -598,8 +600,8 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
598600
--------
599601
>>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
600602
>>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
601-
[a, b, a, b]
602-
Categories (2, object): [a < b]
603+
['a', 'b', 'a', 'b']
604+
Categories (2, object): ['a' < 'b']
603605
"""
604606
dtype = CategoricalDtype._from_values_or_dtype(
605607
categories=categories, ordered=ordered, dtype=dtype
@@ -659,13 +661,13 @@ def _set_categories(self, categories, fastpath=False):
659661
--------
660662
>>> c = pd.Categorical(['a', 'b'])
661663
>>> c
662-
[a, b]
663-
Categories (2, object): [a, b]
664+
['a', 'b']
665+
Categories (2, object): ['a', 'b']
664666
665667
>>> c._set_categories(pd.Index(['a', 'c']))
666668
>>> c
667-
[a, c]
668-
Categories (2, object): [a, c]
669+
['a', 'c']
670+
Categories (2, object): ['a', 'c']
669671
"""
670672
if fastpath:
671673
new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
@@ -885,14 +887,14 @@ def rename_categories(self, new_categories, inplace=False):
885887
categories not in the dictionary are passed through
886888
887889
>>> c.rename_categories({'a': 'A', 'c': 'C'})
888-
[A, A, b]
889-
Categories (2, object): [A, b]
890+
['A', 'A', 'b']
891+
Categories (2, object): ['A', 'b']
890892
891893
You may also provide a callable to create the new categories
892894
893895
>>> c.rename_categories(lambda x: x.upper())
894-
[A, A, B]
895-
Categories (2, object): [A, B]
896+
['A', 'A', 'B']
897+
Categories (2, object): ['A', 'B']
896898
"""
897899
inplace = validate_bool_kwarg(inplace, "inplace")
898900
cat = self if inplace else self.copy()
@@ -1128,22 +1130,22 @@ def map(self, mapper):
11281130
--------
11291131
>>> cat = pd.Categorical(['a', 'b', 'c'])
11301132
>>> cat
1131-
[a, b, c]
1132-
Categories (3, object): [a, b, c]
1133+
['a', 'b', 'c']
1134+
Categories (3, object): ['a', 'b', 'c']
11331135
>>> cat.map(lambda x: x.upper())
1134-
[A, B, C]
1135-
Categories (3, object): [A, B, C]
1136+
['A', 'B', 'C']
1137+
Categories (3, object): ['A', 'B', 'C']
11361138
>>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
1137-
[first, second, third]
1138-
Categories (3, object): [first, second, third]
1139+
['first', 'second', 'third']
1140+
Categories (3, object): ['first', 'second', 'third']
11391141
11401142
If the mapping is one-to-one the ordering of the categories is
11411143
preserved:
11421144
11431145
>>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
11441146
>>> cat
1145-
[a, b, c]
1146-
Categories (3, object): [a < b < c]
1147+
['a', 'b', 'c']
1148+
Categories (3, object): ['a' < 'b' < 'c']
11471149
>>> cat.map({'a': 3, 'b': 2, 'c': 1})
11481150
[3, 2, 1]
11491151
Categories (3, int64): [3 < 2 < 1]
@@ -1778,29 +1780,29 @@ def take(self: _T, indexer, allow_fill: bool = False, fill_value=None) -> _T:
17781780
--------
17791781
>>> cat = pd.Categorical(['a', 'a', 'b'])
17801782
>>> cat
1781-
[a, a, b]
1782-
Categories (2, object): [a, b]
1783+
['a', 'a', 'b']
1784+
Categories (2, object): ['a', 'b']
17831785
17841786
Specify ``allow_fill=False`` to have negative indices mean indexing
17851787
from the right.
17861788
17871789
>>> cat.take([0, -1, -2], allow_fill=False)
1788-
[a, b, a]
1789-
Categories (2, object): [a, b]
1790+
['a', 'b', 'a']
1791+
Categories (2, object): ['a', 'b']
17901792
17911793
With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
17921794
values that should be filled with the `fill_value`, which is
17931795
``np.nan`` by default.
17941796
17951797
>>> cat.take([0, -1, -1], allow_fill=True)
1796-
[a, NaN, NaN]
1797-
Categories (2, object): [a, b]
1798+
['a', NaN, NaN]
1799+
Categories (2, object): ['a', 'b']
17981800
17991801
The fill value can be specified.
18001802
18011803
>>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')
1802-
[a, a, a]
1803-
Categories (2, object): [a, b]
1804+
['a', 'a', 'a']
1805+
Categories (2, object): ['a', 'b']
18041806
18051807
Specifying a fill value that's not in ``self.categories``
18061808
will raise a ``ValueError``.
@@ -1872,13 +1874,16 @@ def _repr_categories(self):
18721874
)
18731875
from pandas.io.formats import format as fmt
18741876

1877+
format_array = partial(
1878+
fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
1879+
)
18751880
if len(self.categories) > max_categories:
18761881
num = max_categories // 2
1877-
head = fmt.format_array(self.categories[:num], None)
1878-
tail = fmt.format_array(self.categories[-num:], None)
1882+
head = format_array(self.categories[:num])
1883+
tail = format_array(self.categories[-num:])
18791884
category_strs = head + ["..."] + tail
18801885
else:
1881-
category_strs = fmt.format_array(self.categories, None)
1886+
category_strs = format_array(self.categories)
18821887

18831888
# Strip all leading spaces, which format_array adds for columns...
18841889
category_strs = [x.strip() for x in category_strs]
@@ -2051,8 +2056,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
20512056
--------
20522057
>>> c = pd.Categorical(list('aabca'))
20532058
>>> c
2054-
[a, a, b, c, a]
2055-
Categories (3, object): [a, b, c]
2059+
['a', 'a', 'b', 'c', 'a']
2060+
Categories (3, object): ['a', 'b', 'c']
20562061
>>> c.categories
20572062
Index(['a', 'b', 'c'], dtype='object')
20582063
>>> c.codes
@@ -2199,20 +2204,20 @@ def unique(self):
21992204
order of appearance.
22002205
22012206
>>> pd.Categorical(list("baabc")).unique()
2202-
[b, a, c]
2203-
Categories (3, object): [b, a, c]
2207+
['b', 'a', 'c']
2208+
Categories (3, object): ['b', 'a', 'c']
22042209
22052210
>>> pd.Categorical(list("baabc"), categories=list("abc")).unique()
2206-
[b, a, c]
2207-
Categories (3, object): [b, a, c]
2211+
['b', 'a', 'c']
2212+
Categories (3, object): ['b', 'a', 'c']
22082213
22092214
An ordered Categorical preserves the category ordering.
22102215
22112216
>>> pd.Categorical(
22122217
... list("baabc"), categories=list("abc"), ordered=True
22132218
... ).unique()
2214-
[b, a, c]
2215-
Categories (3, object): [a < b < c]
2219+
['b', 'a', 'c']
2220+
Categories (3, object): ['a' < 'b' < 'c']
22162221
"""
22172222
# unlike np.unique, unique1d does not sort
22182223
unique_codes = unique1d(self.codes)
@@ -2465,7 +2470,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
24652470
4 c
24662471
5 c
24672472
dtype: category
2468-
Categories (3, object): [a, b, c]
2473+
Categories (3, object): ['a', 'b', 'c']
24692474
24702475
>>> s.cat.categories
24712476
Index(['a', 'b', 'c'], dtype='object')
@@ -2478,7 +2483,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
24782483
4 a
24792484
5 a
24802485
dtype: category
2481-
Categories (3, object): [c, b, a]
2486+
Categories (3, object): ['c', 'b', 'a']
24822487
24832488
>>> s.cat.reorder_categories(list("cba"))
24842489
0 a
@@ -2488,7 +2493,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
24882493
4 c
24892494
5 c
24902495
dtype: category
2491-
Categories (3, object): [c, b, a]
2496+
Categories (3, object): ['c', 'b', 'a']
24922497
24932498
>>> s.cat.add_categories(["d", "e"])
24942499
0 a
@@ -2498,7 +2503,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
24982503
4 c
24992504
5 c
25002505
dtype: category
2501-
Categories (5, object): [a, b, c, d, e]
2506+
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
25022507
25032508
>>> s.cat.remove_categories(["a", "c"])
25042509
0 NaN
@@ -2508,7 +2513,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
25082513
4 NaN
25092514
5 NaN
25102515
dtype: category
2511-
Categories (1, object): [b]
2516+
Categories (1, object): ['b']
25122517
25132518
>>> s1 = s.cat.add_categories(["d", "e"])
25142519
>>> s1.cat.remove_unused_categories()
@@ -2519,7 +2524,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
25192524
4 c
25202525
5 c
25212526
dtype: category
2522-
Categories (3, object): [a, b, c]
2527+
Categories (3, object): ['a', 'b', 'c']
25232528
25242529
>>> s.cat.set_categories(list("abcde"))
25252530
0 a
@@ -2529,7 +2534,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
25292534
4 c
25302535
5 c
25312536
dtype: category
2532-
Categories (5, object): [a, b, c, d, e]
2537+
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
25332538
25342539
>>> s.cat.as_ordered()
25352540
0 a
@@ -2539,7 +2544,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
25392544
4 c
25402545
5 c
25412546
dtype: category
2542-
Categories (3, object): [a < b < c]
2547+
Categories (3, object): ['a' < 'b' < 'c']
25432548
25442549
>>> s.cat.as_unordered()
25452550
0 a
@@ -2549,7 +2554,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
25492554
4 c
25502555
5 c
25512556
dtype: category
2552-
Categories (3, object): [a, b, c]
2557+
Categories (3, object): ['a', 'b', 'c']
25532558
"""
25542559

25552560
def __init__(self, data):

pandas/core/base.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -743,8 +743,8 @@ def array(self) -> ExtensionArray:
743743
744744
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
745745
>>> ser.array
746-
[a, b, a]
747-
Categories (2, object): [a, b]
746+
['a', 'b', 'a']
747+
Categories (2, object): ['a', 'b']
748748
"""
749749
raise AbstractMethodError(self)
750750

@@ -1481,8 +1481,8 @@ def factorize(self, sort=False, na_sentinel=-1):
14811481
... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
14821482
... )
14831483
>>> ser
1484-
[apple, bread, bread, cheese, milk]
1485-
Categories (4, object): [apple < bread < cheese < milk]
1484+
['apple', 'bread', 'bread', 'cheese', 'milk']
1485+
Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']
14861486
14871487
>>> ser.searchsorted('bread')
14881488
1

pandas/core/construction.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -217,15 +217,15 @@ def array(
217217
You can use the string alias for `dtype`
218218
219219
>>> pd.array(['a', 'b', 'a'], dtype='category')
220-
[a, b, a]
221-
Categories (2, object): [a, b]
220+
['a', 'b', 'a']
221+
Categories (2, object): ['a', 'b']
222222
223223
Or specify the actual dtype
224224
225225
>>> pd.array(['a', 'b', 'a'],
226226
... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
227-
[a, b, a]
228-
Categories (3, object): [a < b < c]
227+
['a', 'b', 'a']
228+
Categories (3, object): ['a' < 'b' < 'c']
229229
230230
If pandas does not infer a dedicated extension type a
231231
:class:`arrays.PandasArray` is returned.
@@ -357,8 +357,8 @@ def extract_array(obj, extract_numpy: bool = False):
357357
Examples
358358
--------
359359
>>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
360-
[a, b, c]
361-
Categories (3, object): [a, b, c]
360+
['a', 'b', 'c']
361+
Categories (3, object): ['a', 'b', 'c']
362362
363363
Other objects like lists, arrays, and DataFrames are just passed through.
364364

0 commit comments

Comments
 (0)