-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: put EA concat logic in _concat_arrays #33535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
78ec756
53bbd25
490c333
05d5eb7
fdd7d91
3137b60
4fab356
23d67cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,6 +95,11 @@ def is_nonempty(x) -> bool: | |
_contains_datetime = any(typ.startswith("datetime") for typ in typs) | ||
_contains_period = any(typ.startswith("period") for typ in typs) | ||
|
||
from pandas.core.arrays import Categorical, SparseArray, datetimelike as dtl | ||
from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array | ||
|
||
to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat] | ||
|
||
all_empty = not len(non_empties) | ||
single_dtype = len({x.dtype for x in to_concat}) == 1 | ||
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) | ||
|
@@ -106,14 +111,15 @@ def is_nonempty(x) -> bool: | |
elif "category" in typs: | ||
# this must be prior to concat_datetime, | ||
# to support Categorical + datetime-like | ||
return concat_categorical(to_concat, axis=axis) | ||
return Categorical._concat_arrays(to_concat, axis=axis) | ||
|
||
elif _contains_datetime or "timedelta" in typs or _contains_period: | ||
return concat_datetime(to_concat, axis=axis, typs=typs) | ||
obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0] | ||
return type(obj)._concat_arrays(to_concat, axis=axis) | ||
|
||
# these are mandated to handle empties as well | ||
elif "sparse" in typs: | ||
return _concat_sparse(to_concat, axis=axis, typs=typs) | ||
return SparseArray._concat_arrays(to_concat, axis=axis) | ||
|
||
elif any_ea and axis == 1: | ||
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] | ||
|
@@ -136,52 +142,6 @@ def is_nonempty(x) -> bool: | |
return np.concatenate(to_concat, axis=axis) | ||
|
||
|
||
def concat_categorical(to_concat, axis: int = 0): | ||
""" | ||
Concatenate an object/categorical array of arrays, each of which is a | ||
single dtype | ||
|
||
Parameters | ||
---------- | ||
to_concat : array of arrays | ||
axis : int | ||
Axis to provide concatenation in the current implementation this is | ||
always 0, e.g. we only have 1D categoricals | ||
|
||
Returns | ||
------- | ||
Categorical | ||
A single array, preserving the combined dtypes | ||
""" | ||
# we could have object blocks and categoricals here | ||
# if we only have a single categoricals then combine everything | ||
# else its a non-compat categorical | ||
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] | ||
|
||
# validate the categories | ||
if len(categoricals) != len(to_concat): | ||
pass | ||
else: | ||
# when all categories are identical | ||
first = to_concat[0] | ||
if all(first.is_dtype_equal(other) for other in to_concat[1:]): | ||
return union_categoricals(categoricals) | ||
|
||
# extract the categoricals & coerce to object if needed | ||
to_concat = [ | ||
x._internal_get_values() | ||
if is_categorical_dtype(x.dtype) | ||
else np.asarray(x).ravel() | ||
if not is_datetime64tz_dtype(x) | ||
else np.asarray(x.astype(object)) | ||
for x in to_concat | ||
] | ||
result = concat_compat(to_concat) | ||
if axis == 1: | ||
result = result.reshape(1, len(result)) | ||
return result | ||
|
||
|
||
def union_categoricals( | ||
to_union, sort_categories: bool = False, ignore_order: bool = False | ||
): | ||
|
@@ -309,28 +269,10 @@ def _maybe_unwrap(x): | |
ordered = False | ||
if all(first.is_dtype_equal(other) for other in to_union[1:]): | ||
# identical categories - fastpath | ||
categories = first.categories | ||
ordered = first.ordered | ||
|
||
if all(first.categories.equals(other.categories) for other in to_union[1:]): | ||
new_codes = np.concatenate([c.codes for c in to_union]) | ||
else: | ||
codes = [first.codes] + [ | ||
recode_for_categories(other.codes, other.categories, first.categories) | ||
for other in to_union[1:] | ||
] | ||
new_codes = np.concatenate(codes) | ||
return Categorical._concat_same_dtype( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We could also move the full of [inline code reference lost in extraction] — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That would be my preference too, but I'm trying to keep the already-broad scope/diff limited. |
||
to_union, sort_categories=sort_categories, ignore_order=ignore_order, | ||
) | ||
|
||
if sort_categories and not ignore_order and ordered: | ||
raise TypeError("Cannot use sort_categories=True with ordered Categoricals") | ||
|
||
if sort_categories and not categories.is_monotonic_increasing: | ||
categories = categories.sort_values() | ||
indexer = categories.get_indexer(first.categories) | ||
|
||
from pandas.core.algorithms import take_1d | ||
|
||
new_codes = take_1d(indexer, new_codes, fill_value=-1) | ||
elif ignore_order or all(not c.ordered for c in to_union): | ||
# different categories - union and recode | ||
cats = first.categories.append([c.categories for c in to_union[1:]]) | ||
|
@@ -354,94 +296,3 @@ def _maybe_unwrap(x): | |
ordered = False | ||
|
||
return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) | ||
|
||
|
||
def _concatenate_2d(to_concat, axis: int): | ||
# coerce to 2d if needed & concatenate | ||
if axis == 1: | ||
to_concat = [np.atleast_2d(x) for x in to_concat] | ||
return np.concatenate(to_concat, axis=axis) | ||
|
||
|
||
def concat_datetime(to_concat, axis=0, typs=None): | ||
""" | ||
provide concatenation of an datetimelike array of arrays each of which is a | ||
single M8[ns], datetimet64[ns, tz] or m8[ns] dtype | ||
|
||
Parameters | ||
---------- | ||
to_concat : array of arrays | ||
axis : axis to provide concatenation | ||
typs : set of to_concat dtypes | ||
|
||
Returns | ||
------- | ||
a single array, preserving the combined dtypes | ||
""" | ||
if typs is None: | ||
typs = get_dtype_kinds(to_concat) | ||
|
||
to_concat = [_wrap_datetimelike(x) for x in to_concat] | ||
single_dtype = len({x.dtype for x in to_concat}) == 1 | ||
|
||
# multiple types, need to coerce to object | ||
if not single_dtype: | ||
# wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta | ||
return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) | ||
|
||
if axis == 1: | ||
# TODO(EA2D): kludge not necessary with 2D EAs | ||
to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat] | ||
|
||
result = type(to_concat[0])._concat_same_type(to_concat, axis=axis) | ||
|
||
if result.ndim == 2 and is_extension_array_dtype(result.dtype): | ||
# TODO(EA2D): kludge not necessary with 2D EAs | ||
assert result.shape[0] == 1 | ||
result = result[0] | ||
return result | ||
|
||
|
||
def _wrap_datetimelike(arr): | ||
""" | ||
Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. | ||
|
||
DTA/TDA handle .astype(object) correctly. | ||
""" | ||
from pandas.core.construction import array as pd_array, extract_array | ||
|
||
arr = extract_array(arr, extract_numpy=True) | ||
if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]: | ||
arr = pd_array(arr) | ||
return arr | ||
|
||
|
||
def _concat_sparse(to_concat, axis=0, typs=None): | ||
""" | ||
provide concatenation of an sparse/dense array of arrays each of which is a | ||
single dtype | ||
|
||
Parameters | ||
---------- | ||
to_concat : array of arrays | ||
axis : axis to provide concatenation | ||
typs : set of to_concat dtypes | ||
|
||
Returns | ||
------- | ||
a single array, preserving the combined dtypes | ||
""" | ||
from pandas.core.arrays import SparseArray | ||
|
||
fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] | ||
fill_value = fill_values[0] | ||
|
||
# TODO: Fix join unit generation so we aren't passed this. | ||
to_concat = [ | ||
x | ||
if isinstance(x, SparseArray) | ||
else SparseArray(x.squeeze(), fill_value=fill_value) | ||
for x in to_concat | ||
] | ||
|
||
return SparseArray._concat_same_type(to_concat) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And to make my suggestion more concrete: instead of importing Categorical here, it would be
from pandas.core.arrays.categorical import _concat_arrays as concat_categorical
(or whatever name we give it)