diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index b3fb3459891e0..ab53fadb06681 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -30,6 +30,7 @@
     ensure_platform_int,
     is_categorical_dtype,
     is_datetime64_dtype,
+    is_datetime64tz_dtype,
     is_dict_like,
     is_dtype_equal,
     is_extension_array_dtype,
@@ -2348,10 +2349,82 @@ def _can_hold_na(self):
         return True
 
     @classmethod
-    def _concat_same_type(self, to_concat):
-        from pandas.core.dtypes.concat import concat_categorical
+    def _concat_same_type(cls, to_concat):
+        return cls._concat_arrays(to_concat)
 
+    # TODO: lock down stricter behavior?
-        return concat_categorical(to_concat)
+    @classmethod
+    def _concat_same_dtype(
+        cls,
+        to_concat,
+        axis: int = 0,
+        sort_categories: bool = False,
+        ignore_order: bool = False,
+    ):
+        """
+        Like _concat_same_type, but with the added restriction of matching dtypes.
+        """
+        ordered = False
+
+        first = to_concat[0]
+
+        # identical categories - fastpath
+        categories = first.categories
+        ordered = first.ordered
+
+        if all(first.categories.equals(other.categories) for other in to_concat[1:]):
+            new_codes = np.concatenate([c.codes for c in to_concat])
+        else:
+            codes = [first.codes] + [
+                recode_for_categories(other.codes, other.categories, first.categories)
+                for other in to_concat[1:]
+            ]
+            new_codes = np.concatenate(codes)
+
+        if sort_categories and not ignore_order and ordered:
+            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")
+
+        if sort_categories and not categories.is_monotonic_increasing:
+            categories = categories.sort_values()
+            indexer = categories.get_indexer(first.categories)
+
+            new_codes = take_1d(indexer, new_codes, fill_value=-1)
+
+        if ignore_order:
+            ordered = False
+
+        return cls(new_codes, categories=categories, ordered=ordered, fastpath=True)
+
+    @classmethod
+    def _concat_arrays(cls, to_concat, axis: int = 0):
+        from pandas.core.dtypes.concat import concat_compat, union_categoricals
+
+        categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
+
+        # validate the categories
+        if len(categoricals) != len(to_concat):
+            pass
+        else:
+            # when all categories are identical
+            first = to_concat[0]
+            if all(first.is_dtype_equal(other) for other in to_concat[1:]):
+                return union_categoricals(categoricals)
+
+        # extract the categoricals & coerce to object if needed
+        to_concat = [
+            x._internal_get_values()
+            if is_categorical_dtype(x.dtype)
+            else np.asarray(x).ravel()
+            if not is_datetime64tz_dtype(x)
+            else np.asarray(x.astype(object))
+            for x in to_concat
+        ]
+
+        result = concat_compat(to_concat)
+        if axis == 1:
+            # TODO(EA2D): this is a kludge for 1D EAs
+            result = result.reshape(1, len(result))
+        return result
 
     def isin(self, values):
         """
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index ece92acae6461..dc7a2c0c17d00 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -24,6 +24,7 @@
     is_datetime64tz_dtype,
     is_datetime_or_timedelta_dtype,
     is_dtype_equal,
+    is_extension_array_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_list_like,
@@ -751,6 +752,30 @@ def _concat_same_type(cls, to_concat, axis: int = 0):
 
         return cls._simple_new(values, dtype=dtype, freq=new_freq)
 
+    @classmethod
+    def _concat_arrays(cls, to_concat, axis: int = 0):
+        from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array
+
+        to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat]
+
+        if len({x.dtype for x in to_concat}) == 1:
+            if axis == 1 and is_extension_array_dtype(to_concat[0].dtype):
+                # TODO(EA2D): not necessary with 2D EAs
+                axis = 0
+
+            result = cls._concat_same_type(to_concat, axis=axis)
+
+            if axis == 1 and result.ndim == 1:
+                # TODO(EA2D): not necessary with 2D EAs
+                result = result.reshape(1, -1)
+            return result
+
+        to_concat = [x.astype(object) for x in to_concat]
+        if axis == 1:
+            # TODO(EA2D): not necessary with 2D EAs
+            to_concat = [np.atleast_2d(x) for x in to_concat]
+        return np.concatenate(to_concat, axis=axis)
+
     def copy(self):
         values = self.asi8.copy()
         return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq)
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 620e157ee54ec..8b5a484407738 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -1022,6 +1022,19 @@ def _concat_same_type(cls, to_concat):
 
         return cls(data, sparse_index=sp_index, fill_value=fill_value)
 
+    @classmethod
+    def _concat_arrays(cls, to_concat, axis: int = 0):
+        fill_values = [x.fill_value for x in to_concat if isinstance(x, cls)]
+        fill_value = fill_values[0]
+
+        # TODO: Fix join unit generation so we aren't passed this.
+        to_concat = [
+            x if isinstance(x, cls) else cls(x.squeeze(), fill_value=fill_value)
+            for x in to_concat
+        ]
+
+        return cls._concat_same_type(to_concat)
+
     def astype(self, dtype=None, copy=True):
         """
         Change the dtype of a SparseArray.
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 257c4fe3c6d30..529426a3331fd 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -95,6 +95,11 @@ def is_nonempty(x) -> bool:
     _contains_datetime = any(typ.startswith("datetime") for typ in typs)
     _contains_period = any(typ.startswith("period") for typ in typs)
 
+    from pandas.core.arrays import Categorical, SparseArray, datetimelike as dtl
+    from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array
+
+    to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat]
+
     all_empty = not len(non_empties)
     single_dtype = len({x.dtype for x in to_concat}) == 1
     any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)
@@ -106,14 +111,15 @@ def is_nonempty(x) -> bool:
     elif "category" in typs:
         # this must be prior to concat_datetime,
         # to support Categorical + datetime-like
-        return concat_categorical(to_concat, axis=axis)
+        return Categorical._concat_arrays(to_concat, axis=axis)
 
     elif _contains_datetime or "timedelta" in typs or _contains_period:
-        return concat_datetime(to_concat, axis=axis, typs=typs)
+        obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0]
+        return type(obj)._concat_arrays(to_concat, axis=axis)
 
     # these are mandated to handle empties as well
     elif "sparse" in typs:
-        return _concat_sparse(to_concat, axis=axis, typs=typs)
+        return SparseArray._concat_arrays(to_concat, axis=axis)
 
     elif any_ea and axis == 1:
         to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
@@ -136,52 +142,6 @@ def is_nonempty(x) -> bool:
     return np.concatenate(to_concat, axis=axis)
 
 
-def concat_categorical(to_concat, axis: int = 0):
-    """
-    Concatenate an object/categorical array of arrays, each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : int
-        Axis to provide concatenation in the current implementation this is
-        always 0, e.g. we only have 1D categoricals
-
-    Returns
-    -------
-    Categorical
-        A single array, preserving the combined dtypes
-    """
-    # we could have object blocks and categoricals here
-    # if we only have a single categoricals then combine everything
-    # else its a non-compat categorical
-    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
-
-    # validate the categories
-    if len(categoricals) != len(to_concat):
-        pass
-    else:
-        # when all categories are identical
-        first = to_concat[0]
-        if all(first.is_dtype_equal(other) for other in to_concat[1:]):
-            return union_categoricals(categoricals)
-
-    # extract the categoricals & coerce to object if needed
-    to_concat = [
-        x._internal_get_values()
-        if is_categorical_dtype(x.dtype)
-        else np.asarray(x).ravel()
-        if not is_datetime64tz_dtype(x)
-        else np.asarray(x.astype(object))
-        for x in to_concat
-    ]
-    result = concat_compat(to_concat)
-    if axis == 1:
-        result = result.reshape(1, len(result))
-    return result
-
-
 def union_categoricals(
     to_union, sort_categories: bool = False, ignore_order: bool = False
 ):
@@ -309,28 +269,10 @@ def _maybe_unwrap(x):
     ordered = False
     if all(first.is_dtype_equal(other) for other in to_union[1:]):
         # identical categories - fastpath
-        categories = first.categories
-        ordered = first.ordered
-
-        if all(first.categories.equals(other.categories) for other in to_union[1:]):
-            new_codes = np.concatenate([c.codes for c in to_union])
-        else:
-            codes = [first.codes] + [
-                recode_for_categories(other.codes, other.categories, first.categories)
-                for other in to_union[1:]
-            ]
-            new_codes = np.concatenate(codes)
+        return Categorical._concat_same_dtype(
+            to_union, sort_categories=sort_categories, ignore_order=ignore_order,
+        )
 
-        if sort_categories and not ignore_order and ordered:
-            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")
-
-        if sort_categories and not categories.is_monotonic_increasing:
-            categories = categories.sort_values()
-            indexer = categories.get_indexer(first.categories)
-
-            from pandas.core.algorithms import take_1d
-
-            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
         # different categories - union and recode
         cats = first.categories.append([c.categories for c in to_union[1:]])
@@ -354,94 +296,3 @@ def _maybe_unwrap(x):
         ordered = False
 
     return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
-
-
-def _concatenate_2d(to_concat, axis: int):
-    # coerce to 2d if needed & concatenate
-    if axis == 1:
-        to_concat = [np.atleast_2d(x) for x in to_concat]
-    return np.concatenate(to_concat, axis=axis)
-
-
-def concat_datetime(to_concat, axis=0, typs=None):
-    """
-    provide concatenation of an datetimelike array of arrays each of which is a
-    single M8[ns], datetimet64[ns, tz] or m8[ns] dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : axis to provide concatenation
-    typs : set of to_concat dtypes
-
-    Returns
-    -------
-    a single array, preserving the combined dtypes
-    """
-    if typs is None:
-        typs = get_dtype_kinds(to_concat)
-
-    to_concat = [_wrap_datetimelike(x) for x in to_concat]
-    single_dtype = len({x.dtype for x in to_concat}) == 1
-
-    # multiple types, need to coerce to object
-    if not single_dtype:
-        # wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta
-        return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)
-
-    if axis == 1:
-        # TODO(EA2D): kludge not necessary with 2D EAs
-        to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat]
-
-    result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
-
-    if result.ndim == 2 and is_extension_array_dtype(result.dtype):
-        # TODO(EA2D): kludge not necessary with 2D EAs
-        assert result.shape[0] == 1
-        result = result[0]
-    return result
-
-
-def _wrap_datetimelike(arr):
-    """
-    Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.
-
-    DTA/TDA handle .astype(object) correctly.
-    """
-    from pandas.core.construction import array as pd_array, extract_array
-
-    arr = extract_array(arr, extract_numpy=True)
-    if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]:
-        arr = pd_array(arr)
-    return arr
-
-
-def _concat_sparse(to_concat, axis=0, typs=None):
-    """
-    provide concatenation of an sparse/dense array of arrays each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : axis to provide concatenation
-    typs : set of to_concat dtypes
-
-    Returns
-    -------
-    a single array, preserving the combined dtypes
-    """
-    from pandas.core.arrays import SparseArray
-
-    fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
-    fill_value = fill_values[0]
-
-    # TODO: Fix join unit generation so we aren't passed this.
-    to_concat = [
-        x
-        if isinstance(x, SparseArray)
-        else SparseArray(x.squeeze(), fill_value=fill_value)
-        for x in to_concat
-    ]
-
-    return SparseArray._concat_same_type(to_concat)
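
Note on the Categorical changes: _concat_same_dtype is the extracted body of the union_categoricals fastpath, so its behavior is observable through the public API. A minimal sketch of the semantics the refactor preserves, using only public pandas API (the internal call path is an implementation detail of this patch):

import pandas as pd
from pandas.api.types import union_categoricals

# Identical dtypes: union_categoricals defers to the fastpath
# (Categorical._concat_same_dtype after this patch), which concatenates
# the codes directly.
a = pd.Categorical(["b", "a"], categories=["a", "b"])
b = pd.Categorical(["a", "b"], categories=["a", "b"])
print(list(union_categoricals([a, b])))  # ['b', 'a', 'a', 'b']

# sort_categories=True with ordered inputs hits the TypeError that moved
# into _concat_same_dtype.
ordered = pd.Categorical(["a", "b"], ordered=True)
try:
    union_categoricals([ordered, ordered], sort_categories=True)
except TypeError as exc:
    print(exc)  # Cannot use sort_categories=True with ordered Categoricals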
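
Likewise, the object-dtype fallback at the end of DatetimeLikeArrayMixin._concat_arrays (cast everything to object, then np.concatenate) carries over what concat_datetime did for mismatched dtypes. A quick sketch of the user-visible result; how pd.concat reaches this code is internal, but the output dtype is stable:

import pandas as pd

aware = pd.Series(pd.date_range("2020-01-01", periods=2, tz="US/Eastern"))
naive = pd.Series(pd.date_range("2020-01-01", periods=2))

# datetime64[ns, US/Eastern] vs datetime64[ns]: no common dtype, so the
# arrays are coerced to object and the result holds Timestamp objects.
result = pd.concat([aware, naive], ignore_index=True)
print(result.dtype)  # object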