REF: use concat_compat in union_with_duplicates #44125

Merged
merged 1 commit on Oct 21, 2021
14 changes: 8 additions & 6 deletions pandas/core/algorithms.py
@@ -61,6 +61,7 @@
is_timedelta64_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import PandasDtype
from pandas.core.dtypes.generic import (
ABCDatetimeArray,
@@ -1834,17 +1835,18 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
-------
np.ndarray or ExtensionArray
Containing the unsorted union of both arrays.

Notes
-----
Caller is responsible for ensuring lvals.dtype == rvals.dtype.
"""
indexer = []
l_count = value_counts(lvals, dropna=False)
r_count = value_counts(rvals, dropna=False)
l_count, r_count = l_count.align(r_count, fill_value=0)
unique_array = unique(np.append(lvals, rvals))
if not isinstance(lvals, np.ndarray):
# i.e. ExtensionArray
# Note: we only get here with lvals.dtype == rvals.dtype
# TODO: are there any cases where union won't be type/dtype preserving?
unique_array = type(lvals)._from_sequence(unique_array, dtype=lvals.dtype)
unique_array = unique(concat_compat([lvals, rvals]))
unique_array = ensure_wrapped_if_datetimelike(unique_array)

for i, value in enumerate(unique_array):
indexer += [i] * int(max(l_count[value], r_count[value]))
return unique_array.take(indexer)
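
For readers skimming the diff: the point of the switch is that `np.append` funnels everything through `np.asarray`, so ExtensionArray inputs come back as a plain ndarray and the old code had to rebuild the array with `_from_sequence`. `concat_compat` instead dispatches to the array's own `_concat_same_type`, so the dtype survives on its own. A minimal sketch of the difference, using the internal `concat_compat` helper (private API) and illustrative `Int64` values:

```python
import numpy as np
import pandas as pd

from pandas.core.dtypes.concat import concat_compat

lvals = pd.array([1, 2, 2], dtype="Int64")  # masked ExtensionArray
rvals = pd.array([2, 3], dtype="Int64")

# np.append converts both inputs via np.asarray, so the Int64 dtype is lost;
# this is why the old code re-wrapped the result with _from_sequence.
appended = np.append(lvals, rvals)
print(type(appended))   # <class 'numpy.ndarray'>

# concat_compat dispatches to IntegerArray._concat_same_type, so the result
# keeps the Int64 dtype without any re-wrapping.
combined = concat_compat([lvals, rvals])
print(combined.dtype)   # Int64
```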
45 changes: 18 additions & 27 deletions pandas/core/dtypes/concat.py
@@ -1,7 +1,10 @@
"""
Utility functions related to concat.
"""
from typing import cast
from typing import (
TYPE_CHECKING,
cast,
)

import numpy as np

@@ -10,7 +13,10 @@
DtypeObj,
)

from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.cast import (
astype_array,
find_common_type,
)
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_dtype_equal,
@@ -19,15 +25,12 @@
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCCategoricalIndex,
ABCExtensionArray,
ABCSeries,
)

from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseArray
from pandas.core.construction import (
array as pd_array,
ensure_wrapped_if_datetimelike,
)
if TYPE_CHECKING:
from pandas.core.arrays.sparse import SparseArray


def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
@@ -59,26 +62,11 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
# SupportsDType[dtype[Any]], str, Union[Tuple[Any, int], Tuple[Any,
# Union[SupportsIndex, Sequence[SupportsIndex]]], List[Any], _DTypeDict,
# Tuple[Any, Any]]]" [arg-type]
arr = cast(SparseArray, arr)
arr = cast("SparseArray", arr)
return arr.to_dense().astype(dtype, copy=False) # type: ignore[arg-type]

if (
isinstance(arr, np.ndarray)
and arr.dtype.kind in ["m", "M"]
and dtype is np.dtype("object")
):
# wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta
# this can happen when concat_compat is called directly on arrays (when arrays
# are not coming from Index/Series._values), eg in BlockManager.quantile
arr = ensure_wrapped_if_datetimelike(arr)

if isinstance(dtype, ExtensionDtype):
if isinstance(arr, np.ndarray):
# numpy's astype cannot handle ExtensionDtypes
return pd_array(arr, dtype=dtype, copy=False)
return arr.astype(dtype, copy=False)

return arr.astype(dtype, copy=False)
# astype_array includes ensure_wrapped_if_datetimelike
return astype_array(arr, dtype=dtype, copy=False)


def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
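
For context on the `cast_to_common_type` change above: both removed branches (wrapping datetime-likes before `astype(object)`, and routing ndarray-to-ExtensionDtype casts through `pd_array`) are handled inside `astype_array`, per the comment in the new code. A rough sketch of the first case, assuming the internal `astype_array` helper behaves as that comment describes:

```python
import numpy as np

from pandas.core.dtypes.cast import astype_array

dt64 = np.array(["2021-01-01", "2021-01-02"], dtype="M8[ns]")

# cast_to_common_type used to wrap datetime64 data in a DatetimeArray before
# astype(object) so the elements come out as Timestamps rather than
# np.datetime64; astype_array applies the same ensure_wrapped_if_datetimelike
# step internally.
as_object = astype_array(dt64, dtype=np.dtype(object), copy=False)
print(type(as_object[0]).__name__)   # Timestamp
```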
@@ -135,7 +123,8 @@ def is_nonempty(x) -> bool:
target_dtype = find_common_type([x.dtype for x in to_concat])
to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat]

if isinstance(to_concat[0], ExtensionArray):
if isinstance(to_concat[0], ABCExtensionArray):
# TODO: what about EA-backed Index?
cls = type(to_concat[0])
return cls._concat_same_type(to_concat)
else:
@@ -346,6 +335,8 @@ def _concat_datetime(to_concat, axis=0):
-------
a single array, preserving the combined dtypes
"""
from pandas.core.construction import ensure_wrapped_if_datetimelike

to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]

single_dtype = len({x.dtype for x in to_concat}) == 1
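
Since `_concat_datetime` now imports the helper locally (presumably to avoid a module-level import of `pandas.core.construction` from `pandas.core.dtypes.concat`), here is a quick sketch of what `ensure_wrapped_if_datetimelike` does to the raw ndarrays this path can receive; again an internal helper, shown for illustration only:

```python
import numpy as np

from pandas.core.construction import ensure_wrapped_if_datetimelike

raw = np.array(["2021-10-21", "2021-10-22"], dtype="M8[ns]")

# datetime64/timedelta64 ndarrays get wrapped in the corresponding
# ExtensionArray; other arrays pass through unchanged.
wrapped = ensure_wrapped_if_datetimelike(raw)
print(type(wrapped).__name__)                                            # DatetimeArray
print(type(ensure_wrapped_if_datetimelike(np.array([1, 2]))).__name__)   # ndarray
```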