diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 1fe5e4e6e7087..5e349c2f06472 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1140,6 +1140,40 @@ cast from integer dtype to floating dtype (:issue:`22019`) ...: 'c': [1, 1, np.nan, 1, 1]}) In [4]: pd.crosstab(df.a, df.b, normalize='columns') +.. _whatsnew_0240.api.concat_categorical: + +Concatenation Changes +^^^^^^^^^^^^^^^^^^^^^ + +Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now +causes them to be processed as objects when concatenating with anything +other than another ``Categorical`` with the same categories (:issue:`19214`). + +.. ipython:: python + + s = pd.Series([0, 1, np.nan]) + c = pd.Series([0, 1, np.nan], dtype="category") + +*Previous Behavior* + +.. code-block:: ipython + + In [3]: pd.concat([s, c]) + Out[3]: + 0 0.0 + 1 1.0 + 2 NaN + 0 0.0 + 1 1.0 + 2 NaN + dtype: float64 + +*New Behavior* + +.. ipython:: python + + pd.concat([s, c]) + Datetimelike API Changes ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1546,6 +1580,9 @@ MultiIndex I/O ^^^ +- Bug where integer categorical data would be formatted as floats if ``NaN`` values were present (:issue:`19214`) + + .. 
_whatsnew_0240.bug_fixes.nan_with_str_dtype: Proper handling of `np.NaN` in a string data-typed column with the Python engine diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a47406cded7b4..47fe2aa0b93fc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1520,6 +1520,9 @@ def get_values(self): # if we are a datetime and period index, return Index to keep metadata if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) + elif is_integer_dtype(self.categories) and -1 in self._codes: + return self.categories.astype("object").take(self._codes, + fill_value=np.nan) return np.array(self) def check_for_ordered(self, op): diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 227edf60951e6..08b32a216ffb6 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -240,6 +240,17 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp + def test_categorical_repr_int_with_nan(self): + c = Categorical([1, 2, np.nan]) + c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]""" + assert repr(c) == c_exp + + s = Series([1, 2, np.nan], dtype="object").astype("category") + s_exp = """0 1\n1 2\n2 NaN +dtype: category +Categories (2, int64): [1, 2]""" + assert repr(s) == s_exp + def test_categorical_repr_period(self): idx = period_range('2011-01-01 09:00', freq='H', periods=5) c = Categorical(idx) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 0706cb12ac5d0..481f9f0a56812 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -496,7 +496,7 @@ def test_concat_categorical(self): s1 = pd.Series([10, 11, np.nan], dtype='category') s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') - exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2]) + exp = pd.Series([10, 11, 
np.nan, np.nan, 1, 3, 2], dtype='object') tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) @@ -516,12 +516,12 @@ def test_concat_categorical_coercion(self): s1 = pd.Series([1, 2, np.nan], dtype='category') s2 = pd.Series([2, 1, 2]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2]) + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='object') tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # result shouldn't be affected by 1st elem dtype - exp = pd.Series([2, 1, 2, 1, 2, np.nan]) + exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype='object') tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) @@ -541,11 +541,11 @@ def test_concat_categorical_coercion(self): s1 = pd.Series([10, 11, np.nan], dtype='category') s2 = pd.Series([1, 3, 2]) - exp = pd.Series([10, 11, np.nan, 1, 3, 2]) + exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype='object') tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = pd.Series([1, 3, 2, 10, 11, np.nan]) + exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype='object') tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) @@ -581,11 +581,13 @@ def test_concat_categorical_3elem_coercion(self): s2 = pd.Series([2, 1, 2], dtype='category') s3 = pd.Series([1, 2, 1, 2, np.nan]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan]) + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], + dtype='object') tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2]) + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 
2], + dtype='object') tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) @@ -669,7 +671,7 @@ def test_concat_categorical_coercion_nan(self): s1 = pd.Series([1, np.nan], dtype='category') s2 = pd.Series([np.nan, np.nan]) - exp = pd.Series([1, np.nan, np.nan, np.nan]) + exp = pd.Series([1, np.nan, np.nan, np.nan], dtype='object') tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)