pandas-dev · chrispe · May 5, 2020 · May 5, 2020 · May 5, 2020 · May 16, 2020
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
@@ -15,6 +15,7 @@
     is_sparse,
 )
 from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries
+from pandas.core.dtypes.missing import isna
 
 from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.sparse import SparseArray
@@ -61,6 +62,70 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
     return arr.astype(dtype, copy=False)
 
 
+def _can_cast_to_categorical(to_cast):
+    """
+    Evaluate whether a list of arrays can be cast to a single categorical dtype.
+
+    The first array's dtype is the reference, so it must be categorical. Every
+    categorical dtype that has a non-missing category, and whose categories can
+    be cast to all non-categorical dtypes present, must equal the others, and the
+    remaining arrays may only hold missing values or reference categories.
+
+    Parameters
+    ----------
+    to_cast : array of arrays
+
+    Returns
+    -------
+    bool
+        False if the first array is not categorical, or if no single categorical
+        dtype is shared; True otherwise.
+
+    Raises
+    ------
+    ValueError
+        If ``to_cast`` is empty, or an array holds a value that is not a category.
+    """
+    if len(to_cast) == 0:
+        raise ValueError("No arrays to cast")
+
+    types = [x.dtype for x in to_cast]
+    target = types[0]
+
+    # Values are validated against the first dtype. A non-categorical dtype has
+    # no ``categories`` (reading them raised AttributeError), so bail out early.
+    if not is_categorical_dtype(target):
+        return False
+
+    # Usable categorical dtypes: at least one non-missing category, and the
+    # categories' dtype can be cast to every non-categorical dtype present.
+    cat_dtypes = [
+        t
+        for t in types
+        if is_categorical_dtype(t)
+        and len(t.categories.values) > 0
+        and any(~isna(t.categories.values))
+        and all(
+            is_categorical_dtype(x) or np.can_cast(t.categories.values.dtype, x)
+            for x in types
+        )
+    ]
+    if len(cat_dtypes) == 0 or any(
+        not is_dtype_equal(dtype, cat_dtypes[0]) for dtype in cat_dtypes[1:]
+    ):
+        return False
+
+    for other in to_cast[1:]:
+        # Missing values never need a category of their own.
+        present = np.unique(other[~isna(other)]).tolist()
+        if len(other) > 0 and any(val not in target.categories for val in present):
+            raise ValueError(
+                "Cannot concat on a Categorical with a new category, "
+                "set the categories first"
+            )
+    return True
+
+
 def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
     """
     provide concatenation of an array of arrays each of which is a single
@@ -108,7 +173,17 @@ def is_nonempty(x) -> bool:
         # we ignore axis here, as internally concatting with EAs is always
         # for axis=0
         if not single_dtype:
-            target_dtype = find_common_type([x.dtype for x in to_concat])
+            # Special case for handling concat with categorical series.
+            # We need to make sure that categorical dtype is preserved
+            # when an array of valid values is given (GH#25383)
+            if (
+                isinstance(to_concat[0], ExtensionArray)
+                and all(x.shape[0] == 1 for x in to_concat[1:])
+                and _can_cast_to_categorical(to_concat)
+            ):
+                target_dtype = to_concat[0].dtype
+            else:
+                target_dtype = find_common_type([x.dtype for x in to_concat])
             to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
 
         if isinstance(to_concat[0], ExtensionArray):

diff --git a/pandas/tests/series/test_categorical.py b/pandas/tests/series/test_categorical.py
@@ -0,0 +1,116 @@
+import pytest
+
+from pandas.core.dtypes.concat import _can_cast_to_categorical
+
+import pandas as pd
+from pandas import Categorical
+import pandas._testing as tm
+
+
+class TestCategoricalSeries:
+    """Assignment into, and enlargement of, categorical Series and DataFrames."""
+
+    def test_setitem_undefined_category_raises(self):
+        # Label 2 already exists, so this is a plain setitem; "d" is not one of
+        # the categories and has to be rejected.
+        series = pd.Series(Categorical(["a", "b", "c"]))
+        expected_msg = (
+            "Cannot setitem on a Categorical with a new category, "
+            "set the categories first"
+        )
+
+        with pytest.raises(ValueError, match=expected_msg):
+            series.loc[2] = "d"
+
+    def test_concat_undefined_category_raises(self):
+        # Label 3 does not exist yet, so the series is enlarged; "d" is not one
+        # of the categories and has to be rejected.
+        series = pd.Series(Categorical(["a", "b", "c"]))
+        expected_msg = (
+            "Cannot concat on a Categorical with a new category, "
+            "set the categories first"
+        )
+
+        with pytest.raises(ValueError, match=expected_msg):
+            series.loc[3] = "d"
+
+    def test_loc_category_dtype_retention(self):
+        # Enlarging through ``.loc`` with a value that is already a category
+        # (or with a missing value) must keep the categorical dtype.
+        letters = ["a", "b", "c"]
+
+        # DataFrame with a categorical column
+        frame = pd.DataFrame(
+            {
+                "int": [0, 1, 2],
+                "cat": Categorical(letters, categories=letters),
+            }
+        )
+        frame.loc[3] = [3, "c"]
+        expected_frame = pd.DataFrame(
+            {
+                "int": [0, 1, 2, 3],
+                "cat": Categorical(letters + ["c"], categories=letters),
+            }
+        )
+        tm.assert_frame_equal(frame, expected_frame)
+
+        # Series: (initial values, value appended at the new label 3); the
+        # expected result simply has the appended value at the end.
+        series_cases = [
+            (letters, "c"),  # string categories
+            ([1, 2, 3], 3),  # integer categories
+            ([1, 2, 3], pd.NA),  # missing value
+        ]
+        for values, appended in series_cases:
+            series = pd.Series(Categorical(values))
+            series.loc[3] = appended
+            expected_series = pd.Series(Categorical(values + [appended]))
+            tm.assert_series_equal(series, expected_series)
+
+    def test_can_cast_to_categorical(self):
+        # Each case passes the categorical series first, followed by the series
+        # it is to be combined with.
+
+        def as_categorical(values):
+            # Fresh categorical Series for every case
+            return pd.Series(Categorical(values))
+
+        # Two series of identical categorical dtype can be combined.
+        first = as_categorical(["a", "b", "c"])
+        second = as_categorical(["a", "b", "c"])
+        assert _can_cast_to_categorical([first, second]) is True
+
+        # Two series whose categorical dtypes differ cannot.
+        first = as_categorical(["a", "b", "c"])
+        second = as_categorical(["a", "b", "d"])
+        assert _can_cast_to_categorical([first, second]) is False
+
+        # A categorical series and a plain one can be combined when every
+        # value of the plain series is already a category.
+        known_values = [
+            # string categories
+            (["a", "b", "c"], ["a", "a", "b"]),
+            # integer categories
+            ([1, 2, 3], [1, 2]),
+        ]
+        for categories, values in known_values:
+            pair = [as_categorical(categories), pd.Series(values)]
+            assert _can_cast_to_categorical(pair) is True
+
+        # A plain series holding a value that is not a category makes the
+        # check raise instead of returning False.
+        expected_msg = (
+            "Cannot concat on a Categorical with a new category, "
+            "set the categories first"
+        )
+        unknown_values = [
+            # integer categories
+            ([1, 2, 3], [3, 4]),
+            # string categories
+            (["a", "b", "c"], ["d", "e"]),
+        ]
+        for categories, values in unknown_values:
+            pair = [as_categorical(categories), pd.Series(values)]
+            with pytest.raises(ValueError, match=expected_msg):
+                _can_cast_to_categorical(pair)