Skip to content

BUG: repr of Categorical does not distinguish int and str. #34222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,7 @@ Categorical
- Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`)
- Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`)
- :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`)
- Bug where the repr of :class:`Categorical` did not distinguish between ``int`` and ``str`` categories (:issue:`33676`)

Datetimelike
^^^^^^^^^^^^
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,8 +604,8 @@ def factorize(
>>> codes
array([0, 0, 1]...)
>>> uniques
[a, c]
Categories (3, object): [a, b, c]
['a', 'c']
Categories (3, object): ['a', 'b', 'c']

Notice that ``'b'`` is in ``uniques.categories``, despite not being
present in ``cat.values``.
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,14 +846,14 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"
--------
>>> cat = pd.Categorical(['a', 'b', 'c'])
>>> cat
[a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
>>> cat.repeat(2)
[a, a, b, b, c, c]
Categories (3, object): [a, b, c]
['a', 'a', 'b', 'b', 'c', 'c']
Categories (3, object): ['a', 'b', 'c']
>>> cat.repeat([1, 2, 3])
[a, b, b, c, c, c]
Categories (3, object): [a, b, c]
['a', 'b', 'b', 'c', 'c', 'c']
Categories (3, object): ['a', 'b', 'c']
"""

@Substitution(klass="ExtensionArray")
Expand Down
105 changes: 55 additions & 50 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from csv import QUOTE_NONNUMERIC
from functools import partial
import operator
from shutil import get_terminal_size
from typing import Dict, Hashable, List, Type, Union, cast
Expand Down Expand Up @@ -275,17 +277,17 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject):
Categories (3, int64): [1, 2, 3]

>>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
[a, b, c, a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c', 'a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']

Ordered `Categoricals` can be sorted according to the custom order
of the categories and can have a min and max value.

>>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
... categories=['c', 'b', 'a'])
>>> c
[a, b, c, a, b, c]
Categories (3, object): [c < b < a]
['a', 'b', 'c', 'a', 'b', 'c']
Categories (3, object): ['c' < 'b' < 'a']
>>> c.min()
'c'
"""
Expand Down Expand Up @@ -598,8 +600,8 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
--------
>>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
>>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
[a, b, a, b]
Categories (2, object): [a < b]
['a', 'b', 'a', 'b']
Categories (2, object): ['a' < 'b']
"""
dtype = CategoricalDtype._from_values_or_dtype(
categories=categories, ordered=ordered, dtype=dtype
Expand Down Expand Up @@ -659,13 +661,13 @@ def _set_categories(self, categories, fastpath=False):
--------
>>> c = pd.Categorical(['a', 'b'])
>>> c
[a, b]
Categories (2, object): [a, b]
['a', 'b']
Categories (2, object): ['a', 'b']

>>> c._set_categories(pd.Index(['a', 'c']))
>>> c
[a, c]
Categories (2, object): [a, c]
['a', 'c']
Categories (2, object): ['a', 'c']
"""
if fastpath:
new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
Expand Down Expand Up @@ -885,14 +887,14 @@ def rename_categories(self, new_categories, inplace=False):
categories not in the dictionary are passed through

>>> c.rename_categories({'a': 'A', 'c': 'C'})
[A, A, b]
Categories (2, object): [A, b]
['A', 'A', 'b']
Categories (2, object): ['A', 'b']

You may also provide a callable to create the new categories

>>> c.rename_categories(lambda x: x.upper())
[A, A, B]
Categories (2, object): [A, B]
['A', 'A', 'B']
Categories (2, object): ['A', 'B']
"""
inplace = validate_bool_kwarg(inplace, "inplace")
cat = self if inplace else self.copy()
Expand Down Expand Up @@ -1128,22 +1130,22 @@ def map(self, mapper):
--------
>>> cat = pd.Categorical(['a', 'b', 'c'])
>>> cat
[a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
>>> cat.map(lambda x: x.upper())
[A, B, C]
Categories (3, object): [A, B, C]
['A', 'B', 'C']
Categories (3, object): ['A', 'B', 'C']
>>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
[first, second, third]
Categories (3, object): [first, second, third]
['first', 'second', 'third']
Categories (3, object): ['first', 'second', 'third']

If the mapping is one-to-one the ordering of the categories is
preserved:

>>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
>>> cat
[a, b, c]
Categories (3, object): [a < b < c]
['a', 'b', 'c']
Categories (3, object): ['a' < 'b' < 'c']
>>> cat.map({'a': 3, 'b': 2, 'c': 1})
[3, 2, 1]
Categories (3, int64): [3 < 2 < 1]
Expand Down Expand Up @@ -1778,29 +1780,29 @@ def take(self: _T, indexer, allow_fill: bool = False, fill_value=None) -> _T:
--------
>>> cat = pd.Categorical(['a', 'a', 'b'])
>>> cat
[a, a, b]
Categories (2, object): [a, b]
['a', 'a', 'b']
Categories (2, object): ['a', 'b']

Specify ``allow_fill=False`` to have negative indices mean indexing
from the right.

>>> cat.take([0, -1, -2], allow_fill=False)
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']

With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
values that should be filled with the `fill_value`, which is
``np.nan`` by default.

>>> cat.take([0, -1, -1], allow_fill=True)
[a, NaN, NaN]
Categories (2, object): [a, b]
['a', NaN, NaN]
Categories (2, object): ['a', 'b']

The fill value can be specified.

>>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')
[a, a, a]
Categories (2, object): [a, b]
['a', 'a', 'a']
Categories (2, object): ['a', 'b']

Specifying a fill value that's not in ``self.categories``
will raise a ``ValueError``.
Expand Down Expand Up @@ -1872,13 +1874,16 @@ def _repr_categories(self):
)
from pandas.io.formats import format as fmt

format_array = partial(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since you changed this in pandas/io/format.py is it also necessary here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's necessary so that GenericArrayFormatter is initialised with it

fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
)
if len(self.categories) > max_categories:
num = max_categories // 2
head = fmt.format_array(self.categories[:num], None)
tail = fmt.format_array(self.categories[-num:], None)
head = format_array(self.categories[:num])
tail = format_array(self.categories[-num:])
category_strs = head + ["..."] + tail
else:
category_strs = fmt.format_array(self.categories, None)
category_strs = format_array(self.categories)

# Strip all leading spaces, which format_array adds for columns...
category_strs = [x.strip() for x in category_strs]
Expand Down Expand Up @@ -2051,8 +2056,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
--------
>>> c = pd.Categorical(list('aabca'))
>>> c
[a, a, b, c, a]
Categories (3, object): [a, b, c]
['a', 'a', 'b', 'c', 'a']
Categories (3, object): ['a', 'b', 'c']
>>> c.categories
Index(['a', 'b', 'c'], dtype='object')
>>> c.codes
Expand Down Expand Up @@ -2199,20 +2204,20 @@ def unique(self):
order of appearance.

>>> pd.Categorical(list("baabc")).unique()
[b, a, c]
Categories (3, object): [b, a, c]
['b', 'a', 'c']
Categories (3, object): ['b', 'a', 'c']

>>> pd.Categorical(list("baabc"), categories=list("abc")).unique()
[b, a, c]
Categories (3, object): [b, a, c]
['b', 'a', 'c']
Categories (3, object): ['b', 'a', 'c']

An ordered Categorical preserves the category ordering.

>>> pd.Categorical(
... list("baabc"), categories=list("abc"), ordered=True
... ).unique()
[b, a, c]
Categories (3, object): [a < b < c]
['b', 'a', 'c']
Categories (3, object): ['a' < 'b' < 'c']
"""
# unlike np.unique, unique1d does not sort
unique_codes = unique1d(self.codes)
Expand Down Expand Up @@ -2465,7 +2470,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']

>>> s.cat.categories
Index(['a', 'b', 'c'], dtype='object')
Expand All @@ -2478,7 +2483,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 a
5 a
dtype: category
Categories (3, object): [c, b, a]
Categories (3, object): ['c', 'b', 'a']

>>> s.cat.reorder_categories(list("cba"))
0 a
Expand All @@ -2488,7 +2493,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [c, b, a]
Categories (3, object): ['c', 'b', 'a']

>>> s.cat.add_categories(["d", "e"])
0 a
Expand All @@ -2498,7 +2503,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (5, object): [a, b, c, d, e]
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

>>> s.cat.remove_categories(["a", "c"])
0 NaN
Expand All @@ -2508,7 +2513,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 NaN
5 NaN
dtype: category
Categories (1, object): [b]
Categories (1, object): ['b']

>>> s1 = s.cat.add_categories(["d", "e"])
>>> s1.cat.remove_unused_categories()
Expand All @@ -2519,7 +2524,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']

>>> s.cat.set_categories(list("abcde"))
0 a
Expand All @@ -2529,7 +2534,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (5, object): [a, b, c, d, e]
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

>>> s.cat.as_ordered()
0 a
Expand All @@ -2539,7 +2544,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a < b < c]
Categories (3, object): ['a' < 'b' < 'c']

>>> s.cat.as_unordered()
0 a
Expand All @@ -2549,7 +2554,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
"""

def __init__(self, data):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,8 @@ def array(self) -> ExtensionArray:

>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
>>> ser.array
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']
"""
raise AbstractMethodError(self)

Expand Down Expand Up @@ -1481,8 +1481,8 @@ def factorize(self, sort=False, na_sentinel=-1):
... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
... )
>>> ser
[apple, bread, bread, cheese, milk]
Categories (4, object): [apple < bread < cheese < milk]
['apple', 'bread', 'bread', 'cheese', 'milk']
Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']

>>> ser.searchsorted('bread')
1
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,15 +217,15 @@ def array(
You can use the string alias for `dtype`

>>> pd.array(['a', 'b', 'a'], dtype='category')
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']

Or specify the actual dtype

>>> pd.array(['a', 'b', 'a'],
... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
[a, b, a]
Categories (3, object): [a < b < c]
['a', 'b', 'a']
Categories (3, object): ['a' < 'b' < 'c']

If pandas does not infer a dedicated extension type a
:class:`arrays.PandasArray` is returned.
Expand Down Expand Up @@ -357,8 +357,8 @@ def extract_array(obj, extract_numpy: bool = False):
Examples
--------
>>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
[a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']

Other objects like lists, arrays, and DataFrames are just passed through.

Expand Down
Loading