Skip to content

BUG: repr of Categorical does not distinguish int and str. #34222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 24, 2020
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,7 @@ Categorical
- Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`)
- Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`)
- :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`)
- Bug where the repr of :class:`Categorical` did not distinguish between ``int`` and ``str`` values (:issue:`33676`)

Datetimelike
^^^^^^^^^^^^
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,8 +604,8 @@ def factorize(
>>> codes
array([0, 0, 1]...)
>>> uniques
[a, c]
Categories (3, object): [a, b, c]
['a', 'c']
Categories (3, object): ['a', 'b', 'c']

Notice that ``'b'`` is in ``uniques.categories``, despite not being
present in ``cat.values``.
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -847,13 +847,13 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"
>>> cat = pd.Categorical(['a', 'b', 'c'])
>>> cat
[a, b, c]
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
>>> cat.repeat(2)
[a, a, b, b, c, c]
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
>>> cat.repeat([1, 2, 3])
[a, b, b, c, c, c]
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
"""

@Substitution(klass="ExtensionArray")
Expand Down
25 changes: 15 additions & 10 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from csv import QUOTE_NONNUMERIC
from functools import partial
import operator
from shutil import get_terminal_size
from typing import Dict, Hashable, List, Type, Union, cast
Expand Down Expand Up @@ -276,7 +278,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject):

>>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
[a, b, c, a, b, c]
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']

Ordered `Categoricals` can be sorted according to the custom order
of the categories and can have a min and max value.
Expand Down Expand Up @@ -1129,10 +1131,10 @@ def map(self, mapper):
>>> cat = pd.Categorical(['a', 'b', 'c'])
>>> cat
[a, b, c]
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
>>> cat.map(lambda x: x.upper())
[A, B, C]
Categories (3, object): [A, B, C]
Categories (3, object): ['A', 'B', 'C']
>>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
[first, second, third]
Categories (3, object): [first, second, third]
Expand Down Expand Up @@ -1872,13 +1874,16 @@ def _repr_categories(self):
)
from pandas.io.formats import format as fmt

format_array = partial(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you changed this in pandas/io/formats/format.py, is it also necessary here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's necessary so that GenericArrayFormatter is initialised with the ``quoting`` argument.

fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
)
if len(self.categories) > max_categories:
num = max_categories // 2
head = fmt.format_array(self.categories[:num], None)
tail = fmt.format_array(self.categories[-num:], None)
head = format_array(self.categories[:num])
tail = format_array(self.categories[-num:])
category_strs = head + ["..."] + tail
else:
category_strs = fmt.format_array(self.categories, None)
category_strs = format_array(self.categories)

# Strip all leading spaces, which format_array adds for columns...
category_strs = [x.strip() for x in category_strs]
Expand Down Expand Up @@ -2052,7 +2057,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
>>> c = pd.Categorical(list('aabca'))
>>> c
[a, a, b, c, a]
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
>>> c.categories
Index(['a', 'b', 'c'], dtype='object')
>>> c.codes
Expand Down Expand Up @@ -2465,7 +2470,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']

>>> s.cat.categories
Index(['a', 'b', 'c'], dtype='object')
Expand Down Expand Up @@ -2519,7 +2524,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']

>>> s.cat.set_categories(list("abcde"))
0 a
Expand Down Expand Up @@ -2549,7 +2554,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
4 c
5 c
dtype: category
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']
"""

def __init__(self, data):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,8 @@ def array(self) -> ExtensionArray:

>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
>>> ser.array
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']
"""
raise AbstractMethodError(self)

Expand Down Expand Up @@ -1481,8 +1481,8 @@ def factorize(self, sort=False, na_sentinel=-1):
... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
... )
>>> ser
[apple, bread, bread, cheese, milk]
Categories (4, object): [apple < bread < cheese < milk]
['apple', 'bread', 'bread', 'cheese', 'milk']
Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']

>>> ser.searchsorted('bread')
1
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,15 +217,15 @@ def array(
You can use the string alias for `dtype`

>>> pd.array(['a', 'b', 'a'], dtype='category')
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']

Or specify the actual dtype

>>> pd.array(['a', 'b', 'a'],
... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
[a, b, a]
Categories (3, object): [a < b < c]
['a', 'b', 'a']
Categories (3, object): ['a' < 'b' < 'c']

If pandas does not infer a dedicated extension type a
:class:`arrays.PandasArray` is returned.
Expand Down Expand Up @@ -357,8 +357,8 @@ def extract_array(obj, extract_numpy: bool = False):
Examples
--------
>>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
[a, b, c]
Categories (3, object): [a, b, c]
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']

Other objects like lists, arrays, and DataFrames are just passed through.

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def union_categoricals(

>>> union_categoricals([a, b], sort_categories=True)
[b, c, a, b]
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']

`union_categoricals` also works with the case of combining two
categoricals of the same categories and order information (e.g. what
Expand Down Expand Up @@ -267,7 +267,7 @@ def union_categoricals(
>>> b = pd.Categorical(["c", "b", "a"], ordered=True)
>>> union_categoricals([a, b], ignore_order=True)
[a, b, c, c, b, a]
Categories (3, object): [a, b, c]
Categories (3, object): ['a', 'b', 'c']

`union_categoricals` also works with a `CategoricalIndex`, or `Series`
containing categorical data, but note that the resulting array will
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,8 +524,8 @@ def values(self):
array(['a', 'a', 'b', 'c'], dtype=object)

>>> pd.Series(list('aabc')).astype('category').values
[a, a, b, c]
Categories (3, object): [a, b, c]
['a', 'a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']

Timezone aware datetime data is converted to UTC:

Expand Down Expand Up @@ -1850,15 +1850,15 @@ def unique(self):
appearance.

>>> pd.Series(pd.Categorical(list('baabc'))).unique()
[b, a, c]
Categories (3, object): [b, a, c]
['b', 'a', 'c']
Categories (3, object): ['b', 'a', 'c']

An ordered Categorical preserves the category ordering.

>>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'),
... ordered=True)).unique()
[b, a, c]
Categories (3, object): [a < b < c]
['b', 'a', 'c']
Categories (3, object): ['a' < 'b' < 'c']
"""
result = super().unique()
return result
Expand Down
19 changes: 14 additions & 5 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

from contextlib import contextmanager
from csv import QUOTE_NONE, QUOTE_NONNUMERIC
from datetime import tzinfo
import decimal
from functools import partial
Expand Down Expand Up @@ -176,6 +177,7 @@ def __init__(
self.na_rep = na_rep
self.length = length
self.footer = footer
self.quoting = QUOTE_NONNUMERIC

def _get_footer(self) -> str:
footer = ""
Expand All @@ -200,6 +202,7 @@ def _get_formatted_values(self) -> List[str]:
None,
float_format=None,
na_rep=self.na_rep,
quoting=self.quoting,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

quoting always QUOTE_NONNUMERIC

)

def to_string(self) -> str:
Expand Down Expand Up @@ -1109,6 +1112,7 @@ def format_array(
justify: str = "right",
decimal: str = ".",
leading_space: Optional[bool] = None,
quoting: Optional[int] = None,
) -> List[str]:
"""
Format an array for printing.
Expand Down Expand Up @@ -1171,6 +1175,7 @@ def format_array(
justify=justify,
decimal=decimal,
leading_space=leading_space,
quoting=quoting,
)

return fmt_obj.get_result()
Expand Down Expand Up @@ -1216,11 +1221,15 @@ def _format_strings(self) -> List[str]:
else:
float_format = self.float_format

formatter = (
self.formatter
if self.formatter is not None
else (lambda x: pprint_thing(x, escape_chars=("\t", "\r", "\n")))
)
if self.formatter is not None:
formatter = self.formatter
else:
quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE
formatter = partial(
pprint_thing,
escape_chars=("\t", "\r", "\n"),
quote_strings=quote_strings,
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the ternary expression is getting more complex, an if/else may now be more readable. Maybe use partial instead of a lambda, and move the quote_strings assignment inside the relevant if/else branch.


def _format(x):
if self.na_rep is not None and is_scalar(x) and isna(x):
Expand Down
31 changes: 20 additions & 11 deletions pandas/tests/arrays/categorical/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

class TestCategoricalReprWithFactor(TestCategorical):
def test_print(self):
expected = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"]
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(self.factor)
assert actual == expected
Expand All @@ -24,9 +27,9 @@ class TestCategoricalRepr:
def test_big_print(self):
factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True)
expected = [
"[a, b, c, a, b, ..., b, c, a, b, c]",
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
"Length: 600",
"Categories (3, object): [a, b, c]",
"Categories (3, object): ['a', 'b', 'c']",
]
expected = "\n".join(expected)

Expand All @@ -36,13 +39,13 @@ def test_big_print(self):

def test_empty_print(self):
factor = Categorical([], ["a", "b", "c"])
expected = "[], Categories (3, object): [a, b, c]"
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected

assert expected == actual
factor = Categorical([], ["a", "b", "c"], ordered=True)
expected = "[], Categories (3, object): [a < b < c]"
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual

Expand All @@ -64,17 +67,17 @@ def test_print_none_width(self):
def test_unicode_print(self):
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
expected = """\
[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc']
Length: 60
Categories (3, object): [aaaaa, bb, cccc]"""
Categories (3, object): ['aaaaa', 'bb', 'cccc']"""

assert repr(c) == expected

c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """\
[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa

assert repr(c) == expected

Expand All @@ -83,9 +86,9 @@ def test_unicode_print(self):
with option_context("display.unicode.east_asian_width", True):

c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa

assert repr(c) == expected

Expand Down Expand Up @@ -523,3 +526,9 @@ def test_categorical_index_repr_timedelta_ordered(self):
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa

assert repr(i) == exp

def test_categorical_str_repr(self):
# GH 33676
result = repr(Categorical([1, "2", 3, 4]))
expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']"
assert result == expected
4 changes: 2 additions & 2 deletions pandas/tests/series/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def test_categorical_repr(self):
"0 a\n1 b\n"
+ " ..\n"
+ "48 a\n49 b\n"
+ "Length: 50, dtype: category\nCategories (2, object): [a, b]"
+ "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']"
)
with option_context("display.max_rows", 5):
assert exp == repr(a)
Expand All @@ -279,7 +279,7 @@ def test_categorical_repr(self):
a = Series(Categorical(["a", "b"], categories=levs, ordered=True))
exp = (
"0 a\n1 b\n" + "dtype: category\n"
"Categories (26, object): [a < b < c < d ... w < x < y < z]"
"Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 'w' < 'x' < 'y' < 'z']"
)
assert exp == a.__str__()

Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/util/test_assert_series_equal.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,10 @@ def test_series_equal_categorical_values_mismatch(check_less_precise):

Series values are different \\(66\\.66667 %\\)
\\[index\\]: \\[0, 1, 2\\]
\\[left\\]: \\[a, b, c\\]
Categories \\(3, object\\): \\[a, b, c\\]
\\[right\\]: \\[a, c, b\\]
Categories \\(3, object\\): \\[a, b, c\\]"""
\\[left\\]: \\['a', 'b', 'c'\\]
Categories \\(3, object\\): \\['a', 'b', 'c'\\]
\\[right\\]: \\['a', 'c', 'b'\\]
Categories \\(3, object\\): \\['a', 'b', 'c'\\]"""

s1 = Series(Categorical(["a", "b", "c"]))
s2 = Series(Categorical(["a", "c", "b"]))
Expand Down
Loading