Skip to content

REF/PERF: pd.concat #52312

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits on Apr 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9347,7 +9347,13 @@ def compare(
else:
axis = self._get_axis_number(align_axis)

diff = concat([self, other], axis=axis, keys=result_names)
# error: List item 0 has incompatible type "NDFrame"; expected
# "Union[Series, DataFrame]"
diff = concat(
[self, other], # type: ignore[list-item]
axis=axis,
keys=result_names,
)

if axis >= self.ndim:
# No need to reorganize data if stacking on new axis
Expand Down
211 changes: 125 additions & 86 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@
DataFrame,
Series,
)
from pandas.core.generic import NDFrame

# ---------------------------------------------------------------------
# Concatenate DataFrame objects
Expand Down Expand Up @@ -98,7 +97,7 @@ def concat(

@overload
def concat(
objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
*,
axis: Literal[0, "index"] = ...,
join: str = ...,
Expand All @@ -115,7 +114,7 @@ def concat(

@overload
def concat(
objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
*,
axis: Literal[1, "columns"],
join: str = ...,
Expand All @@ -132,7 +131,7 @@ def concat(

@overload
def concat(
objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
*,
axis: Axis = ...,
join: str = ...,
Expand All @@ -148,7 +147,7 @@ def concat(


def concat(
objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
*,
axis: Axis = 0,
join: str = "outer",
Expand Down Expand Up @@ -395,7 +394,7 @@ class _Concatenator:

def __init__(
self,
objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
axis: Axis = 0,
join: str = "outer",
keys=None,
Expand All @@ -421,6 +420,72 @@ def __init__(
"Only can inner (intersect) or outer (union) join the other axis"
)

if not is_bool(sort):
raise ValueError(
f"The 'sort' keyword only accepts boolean values; {sort} was passed."
)
# Incompatible types in assignment (expression has type "Union[bool, bool_]",
# variable has type "bool")
self.sort = sort # type: ignore[assignment]

self.ignore_index = ignore_index
self.verify_integrity = verify_integrity
self.copy = copy

objs, keys = self._clean_keys_and_objs(objs, keys)

# figure out what our result ndim is going to be
ndims = self._get_ndims(objs)
sample, objs = self._get_sample_object(objs, ndims, keys, names, levels)

# Standardize axis parameter to int
if sample.ndim == 1:
from pandas import DataFrame

axis = DataFrame._get_axis_number(axis)
self._is_frame = False
self._is_series = True
else:
axis = sample._get_axis_number(axis)
self._is_frame = True
self._is_series = False

# Need to flip BlockManager axis in the DataFrame special case
axis = sample._get_block_manager_axis(axis)

# if we have mixed ndims, then convert to highest ndim
# creating column numbers as needed
if len(ndims) > 1:
objs, sample = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis)

self.objs = objs

# note: this is the BlockManager axis (since DataFrame is transposed)
self.bm_axis = axis
self.axis = 1 - self.bm_axis if self._is_frame else 0
self.keys = keys
self.names = names or getattr(keys, "names", None)
self.levels = levels

def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]:
# figure out what our result ndim is going to be
ndims = set()
for obj in objs:
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
msg = (
f"cannot concatenate object of type '{type(obj)}'; "
"only Series and DataFrame objs are valid"
)
raise TypeError(msg)

ndims.add(obj.ndim)
return ndims

def _clean_keys_and_objs(
self,
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
keys,
) -> tuple[list[Series | DataFrame], Index | None]:
if isinstance(objs, abc.Mapping):
if keys is None:
keys = list(objs.keys())
Expand All @@ -434,7 +499,7 @@ def __init__(
if keys is None:
objs = list(com.not_none(*objs))
else:
# #1649
# GH#1649
clean_keys = []
clean_objs = []
for k, v in zip(keys, objs):
Expand All @@ -454,22 +519,20 @@ def __init__(
if len(objs) == 0:
raise ValueError("All objects passed were None")

# figure out what our result ndim is going to be
ndims = set()
for obj in objs:
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
msg = (
f"cannot concatenate object of type '{type(obj)}'; "
"only Series and DataFrame objs are valid"
)
raise TypeError(msg)

ndims.add(obj.ndim)
return objs, keys

def _get_sample_object(
self,
objs: list[Series | DataFrame],
ndims: set[int],
keys,
names,
levels,
) -> tuple[Series | DataFrame, list[Series | DataFrame]]:
# get the sample
# want the highest ndim that we have, and must be non-empty
# unless all objs are empty
sample: NDFrame | None = None
sample: Series | DataFrame | None = None
if len(ndims) > 1:
max_ndim = max(ndims)
for obj in objs:
Expand All @@ -490,82 +553,48 @@ def __init__(

if sample is None:
sample = objs[0]
self.objs = objs

# Standardize axis parameter to int
if sample.ndim == 1:
from pandas import DataFrame

axis = DataFrame._get_axis_number(axis)
self._is_frame = False
self._is_series = True
else:
axis = sample._get_axis_number(axis)
self._is_frame = True
self._is_series = False

# Need to flip BlockManager axis in the DataFrame special case
if self._is_frame:
axis = sample._get_block_manager_axis(axis)

if not 0 <= axis <= sample.ndim:
raise AssertionError(
f"axis must be between 0 and {sample.ndim}, input was {axis}"
)
return sample, objs

def _sanitize_mixed_ndim(
self,
objs: list[Series | DataFrame],
sample: Series | DataFrame,
ignore_index: bool,
axis: AxisInt,
) -> tuple[list[Series | DataFrame], Series | DataFrame]:
# if we have mixed ndims, then convert to highest ndim
# creating column numbers as needed
if len(ndims) > 1:
current_column = 0
max_ndim = sample.ndim
self.objs, objs = [], self.objs
for obj in objs:
ndim = obj.ndim
if ndim == max_ndim:
pass

elif ndim != max_ndim - 1:
raise ValueError(
"cannot concatenate unaligned mixed "
"dimensional NDFrame objects"
)
new_objs = []

else:
name = getattr(obj, "name", None)
if ignore_index or name is None:
name = current_column
current_column += 1
current_column = 0
max_ndim = sample.ndim
for obj in objs:
ndim = obj.ndim
if ndim == max_ndim:
pass

# doing a row-wise concatenation so need everything
# to line up
if self._is_frame and axis == 1:
name = 0
# mypy needs to know sample is not an NDFrame
sample = cast("DataFrame | Series", sample)
obj = sample._constructor({name: obj}, copy=False)
elif ndim != max_ndim - 1:
raise ValueError(
"cannot concatenate unaligned mixed dimensional NDFrame objects"
)

self.objs.append(obj)
else:
name = getattr(obj, "name", None)
if ignore_index or name is None:
name = current_column
current_column += 1

# note: this is the BlockManager axis (since DataFrame is transposed)
self.bm_axis = axis
self.axis = 1 - self.bm_axis if self._is_frame else 0
self.keys = keys
self.names = names or getattr(keys, "names", None)
self.levels = levels
# doing a row-wise concatenation so need everything
# to line up
if self._is_frame and axis == 1:
name = 0

if not is_bool(sort):
raise ValueError(
f"The 'sort' keyword only accepts boolean values; {sort} was passed."
)
# Incompatible types in assignment (expression has type "Union[bool, bool_]",
# variable has type "bool")
self.sort = sort # type: ignore[assignment]
obj = sample._constructor({name: obj}, copy=False)

self.ignore_index = ignore_index
self.verify_integrity = verify_integrity
self.copy = copy
new_objs.append(obj)

self.new_axes = self._get_new_axes()
return new_objs, sample

def get_result(self):
cons: Callable[..., DataFrame | Series]
Expand All @@ -583,7 +612,16 @@ def get_result(self):
arrs = [ser._values for ser in self.objs]

res = concat_compat(arrs, axis=0)
mgr = type(sample._mgr).from_array(res, index=self.new_axes[0])

new_index: Index
if self.ignore_index:
# We can avoid surprisingly-expensive _get_concat_axis
new_index = default_index(len(res))
else:
new_index = self.new_axes[0]

mgr = type(sample._mgr).from_array(res, index=new_index)

result = cons(mgr, name=name, fastpath=True)
return result.__finalize__(self, method="concat")

Expand Down Expand Up @@ -634,7 +672,8 @@ def _get_result_dim(self) -> int:
else:
return self.objs[0].ndim

def _get_new_axes(self) -> list[Index]:
@cache_readonly
def new_axes(self) -> list[Index]:
ndim = self._get_result_dim()
return [
self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
Expand Down