pandas-dev · jreback · May 2, 2020 · Apr 17, 2020 · Apr 17, 2020 · Apr 17, 2020
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2349,9 +2349,9 @@ def _can_hold_na(self):
 
     @classmethod
     def _concat_same_type(self, to_concat):
-        from pandas.core.dtypes.concat import concat_categorical
+        from pandas.core.dtypes.concat import union_categoricals
 
-        return concat_categorical(to_concat)
+        return union_categoricals(to_concat)
 
     def isin(self, values):
         """

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -1,11 +1,11 @@
 import numbers
-from typing import TYPE_CHECKING, Tuple, Type, Union
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
 import warnings
 
 import numpy as np
 
 from pandas._libs import lib, missing as libmissing
-from pandas._typing import ArrayLike
+from pandas._typing import ArrayLike, DtypeObj
 from pandas.compat import set_function_name
 from pandas.util._decorators import cache_readonly
 
@@ -95,6 +95,15 @@ def construct_array_type(cls) -> Type["IntegerArray"]:
         """
         return IntegerArray
 
+    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+        # for now only handle other integer types
+        if not all(isinstance(t, _IntegerDtype) for t in dtypes):
+            return None
+        np_dtype = np.find_common_type([t.numpy_dtype for t in dtypes], [])
+        if np.issubdtype(np_dtype, np.integer):
+            return _dtypes[str(np_dtype)]
+        return None
+
     def __from_arrow__(
         self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
     ) -> "IntegerArray":

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -952,27 +952,7 @@ def copy(self):
 
     @classmethod
     def _concat_same_type(cls, to_concat):
-        fill_values = [x.fill_value for x in to_concat]
-
-        fill_value = fill_values[0]
-
-        # np.nan isn't a singleton, so we may end up with multiple
-        # NaNs here, so we ignore tha all NA case too.
-        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
-            warnings.warn(
-                "Concatenating sparse arrays with multiple fill "
-                f"values: '{fill_values}'. Picking the first and "
-                "converting the rest.",
-                PerformanceWarning,
-                stacklevel=6,
-            )
-            keep = to_concat[0]
-            to_concat2 = [keep]
-
-            for arr in to_concat[1:]:
-                to_concat2.append(cls(np.asarray(arr), fill_value=fill_value))
-
-            to_concat = to_concat2
+        fill_value = to_concat[0].fill_value
 
         values = []
         length = 0

diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
@@ -1,11 +1,13 @@
 """Sparse Dtype"""
 
 import re
-from typing import TYPE_CHECKING, Any, Tuple, Type
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type
+import warnings
 
 import numpy as np
 
-from pandas._typing import Dtype
+from pandas._typing import Dtype, DtypeObj
+from pandas.errors import PerformanceWarning
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import astype_nansafe
@@ -352,3 +354,23 @@ def _subtype_with_str(self):
         if isinstance(self.fill_value, str):
             return type(self.fill_value)
         return self.subtype
+
+    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+
+        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
+        fill_value = fill_values[0]
+
+        # np.nan isn't a singleton, so we may end up with multiple
+        # NaNs here, so we ignore tha all NA case too.
+        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
+            warnings.warn(
+                "Concatenating sparse arrays with multiple fill "
+                f"values: '{fill_values}'. Picking the first and "
+                "converting the rest.",
+                PerformanceWarning,
+                stacklevel=6,
+            )
+
+        # TODO also handle non-numpy other dtypes
+        np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+        return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 
+from pandas._typing import DtypeObj
 from pandas.errors import AbstractMethodError
 
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
@@ -322,3 +323,34 @@ def _is_boolean(self) -> bool:
         bool
         """
         return False
+
+    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+        """
+        Return the common dtype, if one exists.
+
+        Used in `find_common_type` implementation. This is for example used
+        to determine the resulting dtype in a concat operation.
+
+        If no common dtype exists, return None. If all dtypes in the list
+        will return None, then the common dtype will be "object" dtype.
+
+        Parameters
+        ----------
+        dtypes : list of dtypes
+            The dtypes for which to determine a common dtype. This is a list
+            of np.dtype or ExtensionDtype instances.
+
+        Returns
+        -------
+        Common dtype (np.dtype or ExtensionDtype) or None
+        """
+        # QUESTIONS:
+        # - do we guarantee that `dtypes` is already deduplicated? (list of uniques)
+        # - do we call this method if `len(dtypes) == 1`, or does this method
+        #   need to handle that case
+        # - does this method need to handle "non-fully-initialized" dtypes?
+        if len(set(dtypes)) == 1:
+            # only itself
+            return self
+        else:
+            return None
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1474,7 +1474,12 @@ def find_common_type(types):
         return first
 
     if any(isinstance(t, ExtensionDtype) for t in types):
-        return np.object
+        for t in types:
+            if isinstance(t, ExtensionDtype):
+                res = t._get_common_type(types)
+                if res is not None:
+                    return res
+        return np.dtype("object")
 
     # take lowest unit
     if all(is_datetime64_dtype(t) for t in types):

diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
@@ -4,6 +4,9 @@
 
 import numpy as np
 
+from pandas._typing import ArrayLike, DtypeObj
+
+from pandas.core.dtypes.cast import find_common_type
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_categorical_dtype,
@@ -17,6 +20,9 @@
 )
 from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries
 
+from pandas.core.arrays import ExtensionArray
+from pandas.core.construction import array
+
 
 def get_dtype_kinds(l):
     """
@@ -58,6 +64,40 @@ def get_dtype_kinds(l):
     return typs
 
 
+def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
+    """
+    Helper function for `arr.astype(common_type)` but handling all special
+    cases.
+    """
+    if (
+        is_categorical_dtype(arr.dtype)
+        and isinstance(dtype, np.dtype)
+        and np.issubdtype(dtype, np.integer)
+    ):
+        # problem case: categorical of int -> gives int as result dtype,
+        # but categorical can contain NAs -> fall back to object dtype
+        try:
+            return arr.astype(dtype, copy=False)
+        except ValueError:
+            return arr.astype(object, copy=False)
+
+    if (
+        isinstance(arr, np.ndarray)
+        and arr.dtype.kind in ["m", "M"]
+        and dtype is np.dtype("object")
+    ):
+        # wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta
+        # this can happen when concat_compat is called directly on arrays (when arrays
+        # are not coming from Index/Series._values), eg in BlockManager.quantile
+        arr = array(arr)
+
+    if is_extension_array_dtype(dtype):
+        if isinstance(arr, np.ndarray):
+            # numpy's astype cannot handle ExtensionDtypes
+            return array(arr, dtype=dtype, copy=False)
+    return arr.astype(dtype, copy=False)
+
+
 def concat_compat(to_concat, axis: int = 0):
     """
     provide concatenation of an array of arrays each of which is a single
@@ -93,28 +133,25 @@ def is_nonempty(x) -> bool:
 
     typs = get_dtype_kinds(to_concat)
     _contains_datetime = any(typ.startswith("datetime") for typ in typs)
-    _contains_period = any(typ.startswith("period") for typ in typs)
 
     all_empty = not len(non_empties)
     single_dtype = len({x.dtype for x in to_concat}) == 1
     any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)
 
-    if any_ea and single_dtype and axis == 0:
-        cls = type(to_concat[0])
-        return cls._concat_same_type(to_concat)
+    if any_ea and axis == 0:
+        if not single_dtype:
+            target_dtype = find_common_type([x.dtype for x in to_concat])
+            to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
 
-    elif "category" in typs:
-        # this must be prior to concat_datetime,
-        # to support Categorical + datetime-like
-        return concat_categorical(to_concat, axis=axis)
+        if isinstance(to_concat[0], ExtensionArray):
+            cls = type(to_concat[0])
+            return cls._concat_same_type(to_concat)
+        else:
+            return np.concatenate(to_concat)
 
-    elif _contains_datetime or "timedelta" in typs or _contains_period:
+    elif _contains_datetime or "timedelta" in typs:
         return concat_datetime(to_concat, axis=axis, typs=typs)
 
-    # these are mandated to handle empties as well
-    elif "sparse" in typs:
-        return _concat_sparse(to_concat, axis=axis, typs=typs)
-
     elif any_ea and axis == 1:
         to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
         return np.concatenate(to_concat, axis=axis)
@@ -136,52 +173,6 @@ def is_nonempty(x) -> bool:
     return np.concatenate(to_concat, axis=axis)
 
 
-def concat_categorical(to_concat, axis: int = 0):
-    """
-    Concatenate an object/categorical array of arrays, each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : int
-        Axis to provide concatenation in the current implementation this is
-        always 0, e.g. we only have 1D categoricals
-
-    Returns
-    -------
-    Categorical
-        A single array, preserving the combined dtypes
-    """
-    # we could have object blocks and categoricals here
-    # if we only have a single categoricals then combine everything
-    # else its a non-compat categorical
-    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
-
-    # validate the categories
-    if len(categoricals) != len(to_concat):
-        pass
-    else:
-        # when all categories are identical
-        first = to_concat[0]
-        if all(first.is_dtype_equal(other) for other in to_concat[1:]):
-            return union_categoricals(categoricals)
-
-    # extract the categoricals & coerce to object if needed
-    to_concat = [
-        x._internal_get_values()
-        if is_categorical_dtype(x.dtype)
-        else np.asarray(x).ravel()
-        if not is_datetime64tz_dtype(x)
-        else np.asarray(x.astype(object))
-        for x in to_concat
-    ]
-    result = concat_compat(to_concat)
-    if axis == 1:
-        result = result.reshape(1, len(result))
-    return result
-
-
 def union_categoricals(
     to_union, sort_categories: bool = False, ignore_order: bool = False
 ):
@@ -414,34 +405,3 @@ def _wrap_datetimelike(arr):
     if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]:
         arr = pd_array(arr)
     return arr
-
-
-def _concat_sparse(to_concat, axis=0, typs=None):
-    """
-    provide concatenation of an sparse/dense array of arrays each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : axis to provide concatenation
-    typs : set of to_concat dtypes
-
-    Returns
-    -------
-    a single array, preserving the combined dtypes
-    """
-    from pandas.core.arrays import SparseArray
-
-    fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
-    fill_value = fill_values[0]
-
-    # TODO: Fix join unit generation so we aren't passed this.
-    to_concat = [
-        x
-        if isinstance(x, SparseArray)
-        else SparseArray(x.squeeze(), fill_value=fill_value)
-        for x in to_concat
-    ]
-
-    return SparseArray._concat_same_type(to_concat)
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -21,7 +21,7 @@
 
 from pandas._libs.interval import Interval
 from pandas._libs.tslibs import NaT, Period, Timestamp, timezones
-from pandas._typing import Ordered
+from pandas._typing import DtypeObj, Ordered
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass
@@ -640,6 +640,32 @@ def _is_boolean(self) -> bool:
 
         return is_bool_dtype(self.categories)
 
+    def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
+        # check if we have all categorical dtype with identical categories
+        if all(isinstance(x, CategoricalDtype) for x in dtypes):
+            first = dtypes[0]
+            if all(first == other for other in dtypes[1:]):
+                return first
+
+        # special case non-initialized categorical
+        # TODO we should figure out the expected return value in general
+        non_init_cats = [
+            isinstance(x, CategoricalDtype) and x.categories is None for x in dtypes
+        ]
+        if all(non_init_cats):
+            return self
+        elif any(non_init_cats):
+            return None
+
+        # extract the categories' dtype
+        non_cat_dtypes = [
+            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
+        ]
+        # TODO should categorical always give an answer?
+        from pandas.core.dtypes.cast import find_common_type
+
+        return find_common_type(non_cat_dtypes)
+
 
 @register_extension_dtype
 class DatetimeTZDtype(PandasExtensionDtype):