Skip to content

Commit 2e59512

Browse files
JustinZhengBC authored and Pingviinituutti committed
BUG-19214 int categoricals are formatted as ints (pandas-dev#24494)
1 parent 7248432 commit 2e59512

File tree

4 files changed

+61
-8
lines changed

4 files changed

+61
-8
lines changed

doc/source/whatsnew/v0.24.0.rst

+37
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,40 @@ cast from integer dtype to floating dtype (:issue:`22019`)
11421142
...: 'c': [1, 1, np.nan, 1, 1]})
11431143
In [4]: pd.crosstab(df.a, df.b, normalize='columns')
11441144
1145+
.. _whatsnew_0240.api.concat_categorical:
1146+
1147+
Concatenation Changes
1148+
^^^^^^^^^^^^^^^^^^^^^
1149+
1150+
Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now
1151+
causes them to be processed as objects when concatenating with anything
1152+
other than another ``Categorical`` of ints (:issue:`19214`)
1153+
1154+
.. ipython:: python
1155+
1156+
s = pd.Series([0, 1, np.nan])
1157+
c = pd.Series([0, 1, np.nan], dtype="category")
1158+
1159+
*Previous Behavior*
1160+
1161+
.. code-block:: ipython
1162+
1163+
In [3]: pd.concat([s, c])
1164+
Out[3]:
1165+
0 0.0
1166+
1 1.0
1167+
2 NaN
1168+
0 0.0
1169+
1 1.0
1170+
2 NaN
1171+
dtype: float64
1172+
1173+
*New Behavior*
1174+
1175+
.. ipython:: python
1176+
1177+
pd.concat([s, c])
1178+
11451179
Datetimelike API Changes
11461180
^^^^^^^^^^^^^^^^^^^^^^^^
11471181

@@ -1623,6 +1657,9 @@ MultiIndex
16231657
I/O
16241658
^^^
16251659

1660+
- Bug where integer categorical data would be formatted as floats if ``NaN`` values were present (:issue:`19214`)
1661+
1662+
16261663
.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
16271664

16281665
Proper handling of `np.NaN` in a string data-typed column with the Python engine

pandas/core/arrays/categorical.py

+3
Original file line numberDiff line numberDiff line change
@@ -1491,6 +1491,9 @@ def get_values(self):
14911491
# if we are a datetime and period index, return Index to keep metadata
14921492
if is_datetimelike(self.categories):
14931493
return self.categories.take(self._codes, fill_value=np.nan)
1494+
elif is_integer_dtype(self.categories) and -1 in self._codes:
1495+
return self.categories.astype("object").take(self._codes,
1496+
fill_value=np.nan)
14941497
return np.array(self)
14951498

14961499
def check_for_ordered(self, op):

pandas/tests/arrays/categorical/test_repr.py

+11
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,17 @@ def test_categorical_repr_datetime_ordered(self):
240240

241241
assert repr(c) == exp
242242

243+
def test_categorical_repr_int_with_nan(self):
244+
c = Categorical([1, 2, np.nan])
245+
c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]"""
246+
assert repr(c) == c_exp
247+
248+
s = Series([1, 2, np.nan], dtype="object").astype("category")
249+
s_exp = """0 1\n1 2\n2 NaN
250+
dtype: category
251+
Categories (2, int64): [1, 2]"""
252+
assert repr(s) == s_exp
253+
243254
def test_categorical_repr_period(self):
244255
idx = period_range('2011-01-01 09:00', freq='H', periods=5)
245256
c = Categorical(idx)

pandas/tests/reshape/test_concat.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ def test_concat_categorical(self):
495495
s1 = pd.Series([10, 11, np.nan], dtype='category')
496496
s2 = pd.Series([np.nan, 1, 3, 2], dtype='category')
497497

498-
exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2])
498+
exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype='object')
499499
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
500500
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
501501

@@ -515,12 +515,12 @@ def test_concat_categorical_coercion(self):
515515
s1 = pd.Series([1, 2, np.nan], dtype='category')
516516
s2 = pd.Series([2, 1, 2])
517517

518-
exp = pd.Series([1, 2, np.nan, 2, 1, 2])
518+
exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='object')
519519
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
520520
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
521521

522522
# result shouldn't be affected by 1st elem dtype
523-
exp = pd.Series([2, 1, 2, 1, 2, np.nan])
523+
exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype='object')
524524
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
525525
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
526526

@@ -540,11 +540,11 @@ def test_concat_categorical_coercion(self):
540540
s1 = pd.Series([10, 11, np.nan], dtype='category')
541541
s2 = pd.Series([1, 3, 2])
542542

543-
exp = pd.Series([10, 11, np.nan, 1, 3, 2])
543+
exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype='object')
544544
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
545545
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
546546

547-
exp = pd.Series([1, 3, 2, 10, 11, np.nan])
547+
exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype='object')
548548
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
549549
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
550550

@@ -580,11 +580,13 @@ def test_concat_categorical_3elem_coercion(self):
580580
s2 = pd.Series([2, 1, 2], dtype='category')
581581
s3 = pd.Series([1, 2, 1, 2, np.nan])
582582

583-
exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan])
583+
exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan],
584+
dtype='object')
584585
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
585586
tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
586587

587-
exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2])
588+
exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2],
589+
dtype='object')
588590
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
589591
tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
590592

@@ -668,7 +670,7 @@ def test_concat_categorical_coercion_nan(self):
668670
s1 = pd.Series([1, np.nan], dtype='category')
669671
s2 = pd.Series([np.nan, np.nan])
670672

671-
exp = pd.Series([1, np.nan, np.nan, np.nan])
673+
exp = pd.Series([1, np.nan, np.nan, np.nan], dtype='object')
672674
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
673675
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
674676

0 commit comments

Comments
 (0)