Skip to content

REF: put EA concat logic in _concat_arrays #33535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
79 changes: 76 additions & 3 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ensure_platform_int,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
Expand Down Expand Up @@ -2348,10 +2349,82 @@ def _can_hold_na(self):
return True

@classmethod
def _concat_same_type(self, to_concat):
from pandas.core.dtypes.concat import concat_categorical
def _concat_same_type(cls, to_concat):
return cls._concat_arrays(to_concat)
# TODO: lock down stricter behavior?

return concat_categorical(to_concat)
@classmethod
def _concat_same_dtype(
    cls,
    to_concat,
    axis: int = 0,
    sort_categories: bool = False,
    ignore_order: bool = False,
):
    """
    Like _concat_same_type, but with the added restriction of matching dtypes.
    """
    head = to_concat[0]
    categories = head.categories
    ordered = head.ordered

    if all(head.categories.equals(other.categories) for other in to_concat[1:]):
        # identical categories - fastpath: codes can be concatenated as-is
        new_codes = np.concatenate([cat.codes for cat in to_concat])
    else:
        # recode every trailing Categorical into the head's category order
        recoded = [
            recode_for_categories(other.codes, other.categories, head.categories)
            for other in to_concat[1:]
        ]
        new_codes = np.concatenate([head.codes, *recoded])

    if sort_categories and not ignore_order and ordered:
        raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

    if sort_categories and not categories.is_monotonic_increasing:
        categories = categories.sort_values()
        indexer = categories.get_indexer(head.categories)

        new_codes = take_1d(indexer, new_codes, fill_value=-1)

    if ignore_order:
        ordered = False

    return cls(new_codes, categories=categories, ordered=ordered, fastpath=True)

@classmethod
def _concat_arrays(cls, to_concat, axis: int = 0):
    """
    Concatenate a sequence of arrays of which at least one is categorical.

    When every input is categorical with an identical dtype the result stays
    categorical; otherwise everything is coerced and concatenated as a
    plain (object or common-dtype) ndarray.
    """
    from pandas.core.dtypes.concat import concat_compat, union_categoricals

    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

    # validate the categories
    if len(categoricals) == len(to_concat):
        # all inputs are categorical; union only when dtypes are identical
        head = to_concat[0]
        if all(head.is_dtype_equal(other) for other in to_concat[1:]):
            return union_categoricals(categoricals)

    # extract the categoricals & coerce to object if needed
    coerced = []
    for arr in to_concat:
        if is_categorical_dtype(arr.dtype):
            coerced.append(arr._internal_get_values())
        elif is_datetime64tz_dtype(arr):
            # astype(object) wraps values in Timestamp before np.asarray
            coerced.append(np.asarray(arr.astype(object)))
        else:
            coerced.append(np.asarray(arr).ravel())

    result = concat_compat(coerced)
    if axis == 1:
        # TODO(EA2D): this is a kludge for 1D EAs
        result = result.reshape(1, len(result))
    return result

def isin(self, values):
"""
Expand Down
25 changes: 25 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
is_datetime64tz_dtype,
is_datetime_or_timedelta_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_float_dtype,
is_integer_dtype,
is_list_like,
Expand Down Expand Up @@ -751,6 +752,30 @@ def _concat_same_type(cls, to_concat, axis: int = 0):

return cls._simple_new(values, dtype=dtype, freq=new_freq)

@classmethod
def _concat_arrays(cls, to_concat, axis: int = 0):
    """
    Concatenate a sequence of datetimelike arrays, upcasting ndarrays of
    dt64/td64 dtype first, falling back to object dtype when the inputs
    do not share a single dtype.
    """
    from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array

    arrays = [maybe_upcast_datetimelike_array(x) for x in to_concat]

    dtypes = {x.dtype for x in arrays}
    if len(dtypes) == 1:
        # homogeneous dtype: delegate to _concat_same_type
        if axis == 1 and is_extension_array_dtype(arrays[0].dtype):
            # TODO(EA2D): not necessary with 2D EAs
            # NOTE: mutating axis here also (deliberately) disables the
            # reshape below for the 1D-EA case.
            axis = 0

        result = cls._concat_same_type(arrays, axis=axis)

        if axis == 1 and result.ndim == 1:
            # TODO(EA2D): not necessary with 2D EAs
            result = result.reshape(1, -1)
        return result

    # heterogeneous dtypes: coerce each piece to object and concatenate
    as_object = [x.astype(object) for x in arrays]
    if axis == 1:
        # TODO(EA2D): not necessary with 2D EAs
        as_object = [np.atleast_2d(x) for x in as_object]
    return np.concatenate(as_object, axis=axis)

def copy(self):
    """Return a copy of this array; the underlying i8 integer data is copied."""
    values = self.asi8.copy()
    return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq)
Expand Down
13 changes: 13 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,19 @@ def _concat_same_type(cls, to_concat):

return cls(data, sparse_index=sp_index, fill_value=fill_value)

@classmethod
def _concat_arrays(cls, to_concat, axis: int = 0):
    """
    Concatenate a sequence of sparse/dense arrays into a single SparseArray,
    wrapping any dense inputs using the first sparse input's fill_value.
    """
    sparse_inputs = [x for x in to_concat if isinstance(x, cls)]
    fill_value = sparse_inputs[0].fill_value

    # TODO: Fix join unit generation so we aren't passed this.
    wrapped = []
    for arr in to_concat:
        if isinstance(arr, cls):
            wrapped.append(arr)
        else:
            wrapped.append(cls(arr.squeeze(), fill_value=fill_value))

    return cls._concat_same_type(wrapped)

def astype(self, dtype=None, copy=True):
"""
Change the dtype of a SparseArray.
Expand Down
173 changes: 12 additions & 161 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ def is_nonempty(x) -> bool:
_contains_datetime = any(typ.startswith("datetime") for typ in typs)
_contains_period = any(typ.startswith("period") for typ in typs)

from pandas.core.arrays import Categorical, SparseArray, datetimelike as dtl
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And to make my suggestion more concrete: instead of importing Categorical here, it would be from pandas.core.arrays.categorical import _concat_arrays as concat_categorical (or whatever name we give it)

from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array

to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat]

all_empty = not len(non_empties)
single_dtype = len({x.dtype for x in to_concat}) == 1
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)
Expand All @@ -106,14 +111,15 @@ def is_nonempty(x) -> bool:
elif "category" in typs:
# this must be prior to concat_datetime,
# to support Categorical + datetime-like
return concat_categorical(to_concat, axis=axis)
return Categorical._concat_arrays(to_concat, axis=axis)

elif _contains_datetime or "timedelta" in typs or _contains_period:
return concat_datetime(to_concat, axis=axis, typs=typs)
obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0]
return type(obj)._concat_arrays(to_concat, axis=axis)

# these are mandated to handle empties as well
elif "sparse" in typs:
return _concat_sparse(to_concat, axis=axis, typs=typs)
return SparseArray._concat_arrays(to_concat, axis=axis)

elif any_ea and axis == 1:
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
Expand All @@ -136,52 +142,6 @@ def is_nonempty(x) -> bool:
return np.concatenate(to_concat, axis=axis)


def concat_categorical(to_concat, axis: int = 0):
    """
    Concatenate an object/categorical array of arrays, each of which is a
    single dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis of concatenation; in the current implementation this is
        always 0, e.g. we only have 1D categoricals.

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes.
    """
    # we could have object blocks and categoricals here
    # if we only have a single categoricals then combine everything
    # else its a non-compat categorical
    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

    # validate the categories
    if len(categoricals) == len(to_concat):
        # all inputs are categorical; union only when dtypes are identical
        head = to_concat[0]
        if all(head.is_dtype_equal(other) for other in to_concat[1:]):
            return union_categoricals(categoricals)

    # extract the categoricals & coerce to object if needed
    coerced = []
    for arr in to_concat:
        if is_categorical_dtype(arr.dtype):
            coerced.append(arr._internal_get_values())
        elif is_datetime64tz_dtype(arr):
            coerced.append(np.asarray(arr.astype(object)))
        else:
            coerced.append(np.asarray(arr).ravel())

    result = concat_compat(coerced)
    if axis == 1:
        result = result.reshape(1, len(result))
    return result


def union_categoricals(
to_union, sort_categories: bool = False, ignore_order: bool = False
):
Expand Down Expand Up @@ -309,28 +269,10 @@ def _maybe_unwrap(x):
ordered = False
if all(first.is_dtype_equal(other) for other in to_union[1:]):
# identical categories - fastpath
categories = first.categories
ordered = first.ordered

if all(first.categories.equals(other.categories) for other in to_union[1:]):
new_codes = np.concatenate([c.codes for c in to_union])
else:
codes = [first.codes] + [
recode_for_categories(other.codes, other.categories, first.categories)
for other in to_union[1:]
]
new_codes = np.concatenate(codes)
return Categorical._concat_same_dtype(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also move the whole of union_categoricals to the categorical array module?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that would be my preference too, but trying to keep the already-broad scope/diff limited

to_union, sort_categories=sort_categories, ignore_order=ignore_order,
)

if sort_categories and not ignore_order and ordered:
raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

if sort_categories and not categories.is_monotonic_increasing:
categories = categories.sort_values()
indexer = categories.get_indexer(first.categories)

from pandas.core.algorithms import take_1d

new_codes = take_1d(indexer, new_codes, fill_value=-1)
elif ignore_order or all(not c.ordered for c in to_union):
# different categories - union and recode
cats = first.categories.append([c.categories for c in to_union[1:]])
Expand All @@ -354,94 +296,3 @@ def _maybe_unwrap(x):
ordered = False

return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)


def _concatenate_2d(to_concat, axis: int):
# coerce to 2d if needed & concatenate
if axis == 1:
to_concat = [np.atleast_2d(x) for x in to_concat]
return np.concatenate(to_concat, axis=axis)


def concat_datetime(to_concat, axis=0, typs=None):
    """
    Provide concatenation of a datetimelike array of arrays each of which is a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    if typs is None:
        typs = get_dtype_kinds(to_concat)

    wrapped = [_wrap_datetimelike(x) for x in to_concat]

    if len({x.dtype for x in wrapped}) > 1:
        # multiple types, need to coerce to object
        # wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta
        return _concatenate_2d([x.astype(object) for x in wrapped], axis=axis)

    if axis == 1:
        # TODO(EA2D): kludge not necessary with 2D EAs
        wrapped = [x if x.ndim != 1 else x.reshape(1, -1) for x in wrapped]

    result = type(wrapped[0])._concat_same_type(wrapped, axis=axis)

    if result.ndim == 2 and is_extension_array_dtype(result.dtype):
        # TODO(EA2D): kludge not necessary with 2D EAs
        assert result.shape[0] == 1
        result = result[0]
    return result


def _wrap_datetimelike(arr):
"""
Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.

DTA/TDA handle .astype(object) correctly.
"""
from pandas.core.construction import array as pd_array, extract_array

arr = extract_array(arr, extract_numpy=True)
if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]:
arr = pd_array(arr)
return arr


def _concat_sparse(to_concat, axis=0, typs=None):
"""
provide concatenation of an sparse/dense array of arrays each of which is a
single dtype

Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
typs : set of to_concat dtypes

Returns
-------
a single array, preserving the combined dtypes
"""
from pandas.core.arrays import SparseArray

fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
fill_value = fill_values[0]

# TODO: Fix join unit generation so we aren't passed this.
to_concat = [
x
if isinstance(x, SparseArray)
else SparseArray(x.squeeze(), fill_value=fill_value)
for x in to_concat
]

return SparseArray._concat_same_type(to_concat)