Skip to content

BUG: repr of Categorical does not distinguish int and str. #34222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 24, 2020
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,7 @@ I/O
- Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator. (:issue:`31544`)
- :meth:`HDFStore.keys` now has an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`)
- Repr of :class:`Categorical` was not distinguishing between int and str (:issue:`33676`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably more appropriate for the Categorical section, to be consistent with the similar change for Sparse; see #34352. (NOTE: 1.1 does not yet have a Categorical section.)

Also, I think it is safe to say this was a bug, since the repr of the categories in the dtype repr included the quotes: #34352 (comment)


Plotting
^^^^^^^^
Expand Down
15 changes: 11 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from csv import QUOTE_NONNUMERIC
import operator
from shutil import get_terminal_size
from typing import Dict, Hashable, List, Type, Union, cast
Expand Down Expand Up @@ -1874,11 +1875,17 @@ def _repr_categories(self):

if len(self.categories) > max_categories:
num = max_categories // 2
head = fmt.format_array(self.categories[:num], None)
tail = fmt.format_array(self.categories[-num:], None)
head = fmt.format_array(
self.categories[:num], None, quoting=QUOTE_NONNUMERIC
)
tail = fmt.format_array(
self.categories[-num:], None, quoting=QUOTE_NONNUMERIC
)
category_strs = head + ["..."] + tail
else:
category_strs = fmt.format_array(self.categories, None)
category_strs = fmt.format_array(
self.categories, None, quoting=QUOTE_NONNUMERIC
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use a partial here to avoid duplication.


# Strip all leading spaces, which format_array adds for columns...
category_strs = [x.strip() for x in category_strs]
Expand Down Expand Up @@ -1921,7 +1928,7 @@ def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str:
from pandas.io.formats import format as fmt

formatter = fmt.CategoricalFormatter(
self, length=length, na_rep=na_rep, footer=footer
self, length=length, na_rep=na_rep, footer=footer, quoting=QUOTE_NONNUMERIC
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we don't need to pass this through here, since CategoricalFormatter should always use QUOTE_NONNUMERIC.

)
result = formatter.to_string()
return str(result)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,8 @@ def array(self) -> ExtensionArray:

>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
>>> ser.array
[a, b, a]
Categories (2, object): [a, b]
['a', 'b', 'a']
Categories (2, object): ['a', 'b']
"""
raise AbstractMethodError(self)

Expand Down
13 changes: 12 additions & 1 deletion pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

from contextlib import contextmanager
from csv import QUOTE_NONE
from datetime import tzinfo
import decimal
from functools import partial
Expand Down Expand Up @@ -170,12 +171,14 @@ def __init__(
length: bool = True,
na_rep: str = "NaN",
footer: bool = True,
quoting: Optional[int] = None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe redundant; see the comment above.

):
self.categorical = categorical
self.buf = buf if buf is not None else StringIO("")
self.na_rep = na_rep
self.length = length
self.footer = footer
self.quoting = quoting
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same


def _get_footer(self) -> str:
footer = ""
Expand All @@ -200,6 +203,7 @@ def _get_formatted_values(self) -> List[str]:
None,
float_format=None,
na_rep=self.na_rep,
quoting=self.quoting,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

quoting always QUOTE_NONNUMERIC

)

def to_string(self) -> str:
Expand Down Expand Up @@ -1109,6 +1113,7 @@ def format_array(
justify: str = "right",
decimal: str = ".",
leading_space: Optional[bool] = None,
quoting: Optional[int] = None,
) -> List[str]:
"""
Format an array for printing.
Expand Down Expand Up @@ -1171,6 +1176,7 @@ def format_array(
justify=justify,
decimal=decimal,
leading_space=leading_space,
quoting=quoting,
)

return fmt_obj.get_result()
Expand Down Expand Up @@ -1216,10 +1222,15 @@ def _format_strings(self) -> List[str]:
else:
float_format = self.float_format

quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE
formatter = (
self.formatter
if self.formatter is not None
else (lambda x: pprint_thing(x, escape_chars=("\t", "\r", "\n")))
else (
lambda x: pprint_thing(
x, escape_chars=("\t", "\r", "\n"), quote_strings=quote_strings
)
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the ternary expression is getting more complex, an if/else may now be more readable. Maybe use partial instead of lambda, and maybe move the quote_strings assignment inside the relevant if/else block.

)

def _format(x):
Expand Down
31 changes: 20 additions & 11 deletions pandas/tests/arrays/categorical/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

class TestCategoricalReprWithFactor(TestCategorical):
def test_print(self):
expected = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"]
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(self.factor)
assert actual == expected
Expand All @@ -24,9 +27,9 @@ class TestCategoricalRepr:
def test_big_print(self):
factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True)
expected = [
"[a, b, c, a, b, ..., b, c, a, b, c]",
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
"Length: 600",
"Categories (3, object): [a, b, c]",
"Categories (3, object): ['a', 'b', 'c']",
]
expected = "\n".join(expected)

Expand All @@ -36,13 +39,13 @@ def test_big_print(self):

def test_empty_print(self):
factor = Categorical([], ["a", "b", "c"])
expected = "[], Categories (3, object): [a, b, c]"
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected

assert expected == actual
factor = Categorical([], ["a", "b", "c"], ordered=True)
expected = "[], Categories (3, object): [a < b < c]"
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual

Expand All @@ -64,17 +67,17 @@ def test_print_none_width(self):
def test_unicode_print(self):
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
expected = """\
[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc]
['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc']
Length: 60
Categories (3, object): [aaaaa, bb, cccc]"""
Categories (3, object): ['aaaaa', 'bb', 'cccc']"""

assert repr(c) == expected

c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """\
[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa

assert repr(c) == expected

Expand All @@ -83,9 +86,9 @@ def test_unicode_print(self):
with option_context("display.unicode.east_asian_width", True):

c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう]
expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa

assert repr(c) == expected

Expand Down Expand Up @@ -523,3 +526,9 @@ def test_categorical_index_repr_timedelta_ordered(self):
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa

assert repr(i) == exp

def test_categorical_str_repr(self):
# GH 33676
result = repr(Categorical([1, "2", 3, 4]))
expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']"
assert result == expected
4 changes: 2 additions & 2 deletions pandas/tests/series/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def test_categorical_repr(self):
"0 a\n1 b\n"
+ " ..\n"
+ "48 a\n49 b\n"
+ "Length: 50, dtype: category\nCategories (2, object): [a, b]"
+ "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']"
)
with option_context("display.max_rows", 5):
assert exp == repr(a)
Expand All @@ -279,7 +279,7 @@ def test_categorical_repr(self):
a = Series(Categorical(["a", "b"], categories=levs, ordered=True))
exp = (
"0 a\n1 b\n" + "dtype: category\n"
"Categories (26, object): [a < b < c < d ... w < x < y < z]"
"Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 'w' < 'x' < 'y' < 'z']"
)
assert exp == a.__str__()

Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/util/test_assert_series_equal.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,10 @@ def test_series_equal_categorical_values_mismatch(check_less_precise):

Series values are different \\(66\\.66667 %\\)
\\[index\\]: \\[0, 1, 2\\]
\\[left\\]: \\[a, b, c\\]
Categories \\(3, object\\): \\[a, b, c\\]
\\[right\\]: \\[a, c, b\\]
Categories \\(3, object\\): \\[a, b, c\\]"""
\\[left\\]: \\['a', 'b', 'c'\\]
Categories \\(3, object\\): \\['a', 'b', 'c'\\]
\\[right\\]: \\['a', 'c', 'b'\\]
Categories \\(3, object\\): \\['a', 'b', 'c'\\]"""

s1 = Series(Categorical(["a", "b", "c"]))
s2 = Series(Categorical(["a", "c", "b"]))
Expand Down