From 90e966e66fa02a34e86f1a4b1a3861c4be94f733 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 15 Jun 2022 20:03:17 +0200
Subject: [PATCH 01/12] Revert "REF: remove JoinUnit.shape (#43651)"

This reverts commit bb9a9852265915a4688f772dd062d3fcf4159a32.
---
 pandas/core/internals/concat.py | 56 +++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 228d57fe196a4..991e4bbf4fbdb 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -212,8 +212,6 @@ def concatenate_managers(
     for placement, join_units in concat_plan:
         unit = join_units[0]
         blk = unit.block
-        # Assertion disabled for performance
-        # assert len(join_units) == len(mgrs_indexers)
 
         if len(join_units) == 1:
             values = blk.values
@@ -331,10 +329,14 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
     plan : list of (BlockPlacement, JoinUnit) tuples
 
     """
+    # Calculate post-reindex shape, save for item axis which will be separate
+    # for each block anyway.
+    mgr_shape_list = list(mgr.shape)
+    mgr_shape = tuple(mgr_shape_list)
 
     if mgr.is_single_block:
         blk = mgr.blocks[0]
-        return [(blk.mgr_locs, JoinUnit(blk))]
+        return [(blk.mgr_locs, JoinUnit(blk, mgr_shape))]
 
     blknos = mgr.blknos
     blklocs = mgr.blklocs
@@ -342,9 +344,12 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
 
     plan = []
    for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):
-        # Assertions disabled for performance; these should always hold
-        # assert placements.is_slice_like
-        # assert blkno != -1
+        assert placements.is_slice_like
+        assert blkno != -1
+
+        shape_list = list(mgr_shape)
+        shape_list[0] = len(placements)
+        shape = tuple(shape_list)
 
         blk = mgr.blocks[blkno]
         ax0_blk_indexer = blklocs[placements.indexer]
@@ -374,7 +379,8 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
 
             # Assertions disabled for performance
             # assert blk._mgr_locs.as_slice == placements.as_slice
-        unit = JoinUnit(blk)
+            # assert blk.shape[0] == shape[0]
+        unit = JoinUnit(blk, shape)
 
         plan.append((placements, unit))
 
@@ -382,8 +388,10 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
 
 
 class JoinUnit:
-    def __init__(self, block: Block) -> None:
+    def __init__(self, block: Block, shape: Shape):
+        # Passing shape explicitly is required for cases when block is None.
         self.block = block
+        self.shape = shape
 
     def __repr__(self) -> str:
         return f"{type(self).__name__}({repr(self.block)})"
@@ -396,11 +404,22 @@ def is_na(self) -> bool:
         return False
 
     def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
+        values: ArrayLike
+
         if self.is_na:
-            return make_na_array(empty_dtype, self.block.shape)
+            return make_na_array(empty_dtype, self.shape)
 
         else:
-            return self.block.values
+
+            if not self.block._can_consolidate:
+                # preserve these for validation in concat_compat
+                return self.block.values
+
+            # No dtype upcasting is done here, it will be performed during
+            # concatenation itself.
+            values = self.block.values
+
+            return values
 
 
 def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
@@ -539,9 +558,6 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
     first = join_units[0].block
     if first.dtype.kind == "V":
         return False
-    elif len(join_units) == 1:
-        # only use this path when there is something to concatenate
-        return False
     return (
         # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
         all(type(ju.block) is type(first) for ju in join_units)
@@ -554,8 +570,13 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
             or ju.block.dtype.kind in ["b", "i", "u"]
             for ju in join_units
         )
-        # this also precludes any blocks with dtype.kind == "V", since
-        # we excluded that case for `first` above.
+        and
+        # no blocks that would get missing values (can lead to type upcasts)
+        # unless we're an extension dtype.
+        all(not ju.is_na or ju.block.is_extension for ju in join_units)
+        and
+        # only use this path when there is something to concatenate
+        len(join_units) > 1
     )
@@ -577,7 +598,10 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
     extra_block = join_unit.block.getitem_block(slice(length, None))
     join_unit.block = join_unit.block.getitem_block(slice(length))
 
-    return JoinUnit(block=extra_block)
+    extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
+    join_unit.shape = (length,) + join_unit.shape[1:]
+
+    return JoinUnit(block=extra_block, shape=extra_shape)
 
 
 def _combine_concat_plans(plans):

From b0231a68803e80687a836863af73d02c7576f840 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 15 Jun 2022 20:05:45 +0200
Subject: [PATCH 02/12] Revert "REF: concat on bm_axis==0 (#43626)"

This reverts commit 0de6f8bfef84f29b29b3a74a5833ce5719aaa423.
---
 pandas/core/internals/concat.py | 177 ++++++++++++++++++--------------
 1 file changed, 100 insertions(+), 77 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 991e4bbf4fbdb..f2b0823632c2f 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -36,6 +36,7 @@
 )
 from pandas.core.dtypes.dtypes import ExtensionDtype
 
+import pandas.core.algorithms as algos
 from pandas.core.arrays import (
     DatetimeArray,
     ExtensionArray,
@@ -191,29 +192,19 @@ def concatenate_managers(
     if isinstance(mgrs_indexers[0][0], ArrayManager):
         return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
 
-    # Assertions disabled for performance
-    # for tup in mgrs_indexers:
-    #     # caller is responsible for ensuring this
-    #     indexers = tup[1]
-    #     assert concat_axis not in indexers
-
-    if concat_axis == 0:
-        return _concat_managers_axis0(mgrs_indexers, axes, copy)
-
     mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
-    # Assertion disabled for performance
-    # assert all(not x[1] for x in mgrs_indexers)
-
-    concat_plans = [_get_mgr_concatenation_plan(mgr) for mgr, _ in mgrs_indexers]
-    concat_plan = _combine_concat_plans(concat_plans)
+
+    concat_plans = [
+        _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
+    ]
+    concat_plan = _combine_concat_plans(concat_plans, concat_axis)
 
     blocks = []
 
     for placement, join_units in concat_plan:
         unit = join_units[0]
         blk = unit.block
 
-        if len(join_units) == 1:
+        if len(join_units) == 1 and not join_units[0].indexers:
             values = blk.values
             if copy:
                 values = values.copy()
@@ -237,7 +228,7 @@ def concatenate_managers(
             fastpath = blk.values.dtype == values.dtype
         else:
-            values = _concatenate_join_units(join_units, copy=copy)
+            values = _concatenate_join_units(join_units, concat_axis, copy=copy)
             fastpath = False
 
         if fastpath:
@@ -250,42 +241,6 @@ def concatenate_managers(
     return BlockManager(tuple(blocks), axes)
 
 
-def _concat_managers_axis0(
-    mgrs_indexers, axes: list[Index], copy: bool
-) -> BlockManager:
-    """
-    concat_managers specialized to concat_axis=0, with reindexing already
-    having been done in _maybe_reindex_columns_na_proxy.
- """ - had_reindexers = { - i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers)) - } - mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) - - mgrs = [x[0] for x in mgrs_indexers] - - offset = 0 - blocks = [] - for i, mgr in enumerate(mgrs): - # If we already reindexed, then we definitely don't need another copy - made_copy = had_reindexers[i] - - for blk in mgr.blocks: - if made_copy: - nb = blk.copy(deep=False) - elif copy: - nb = blk.copy() - else: - # by slicing instead of copy(deep=False), we get a new array - # object, see test_concat_copy - nb = blk.getitem_block(slice(None)) - nb._mgr_locs = nb._mgr_locs.add(offset) - blocks.append(nb) - - offset += len(mgr.items) - return BlockManager(tuple(blocks), axes) - - def _maybe_reindex_columns_na_proxy( axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] ) -> list[tuple[BlockManager, dict[int, np.ndarray]]]: @@ -296,33 +251,36 @@ def _maybe_reindex_columns_na_proxy( Columns added in this reindexing have dtype=np.void, indicating they should be ignored when choosing a column's final dtype. """ - new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = [] - + new_mgrs_indexers = [] for mgr, indexers in mgrs_indexers: - # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this - # is a cheap reindexing. - for i, indexer in indexers.items(): - mgr = mgr.reindex_indexer( - axes[i], - indexers[i], - axis=i, + # We only reindex for axis=0 (i.e. columns), as this can be done cheaply + if 0 in indexers: + new_mgr = mgr.reindex_indexer( + axes[0], + indexers[0], + axis=0, copy=False, - only_slice=True, # only relevant for i==0 + only_slice=True, allow_dups=True, - use_na_proxy=True, # only relevant for i==0 + use_na_proxy=True, ) - new_mgrs_indexers.append((mgr, {})) + new_indexers = indexers.copy() + del new_indexers[0] + new_mgrs_indexers.append((new_mgr, new_indexers)) + else: + new_mgrs_indexers.append((mgr, indexers)) return new_mgrs_indexers -def _get_mgr_concatenation_plan(mgr: BlockManager): +def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]): """ - Construct concatenation plan for given block manager. + Construct concatenation plan for given block manager and indexers. Parameters ---------- mgr : BlockManager + indexers : dict of {axis: indexer} Returns ------- @@ -332,11 +290,27 @@ def _get_mgr_concatenation_plan(mgr: BlockManager): # Calculate post-reindex shape , save for item axis which will be separate # for each block anyway. 
     mgr_shape_list = list(mgr.shape)
+    for ax, indexer in indexers.items():
+        mgr_shape_list[ax] = len(indexer)
     mgr_shape = tuple(mgr_shape_list)
 
+    assert 0 not in indexers
+
+    needs_filling = False
+    if 1 in indexers:
+        # indexers[1] is shared by all the JoinUnits, so we can save time
+        # by only doing this check once
+        if (indexers[1] == -1).any():
+            needs_filling = True
+
     if mgr.is_single_block:
         blk = mgr.blocks[0]
-        return [(blk.mgr_locs, JoinUnit(blk, mgr_shape))]
+        return [
+            (
+                blk.mgr_locs,
+                JoinUnit(blk, mgr_shape, indexers, needs_filling=needs_filling),
+            )
+        ]
 
     blknos = mgr.blknos
     blklocs = mgr.blklocs
@@ -347,6 +321,8 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
         assert placements.is_slice_like
         assert blkno != -1
 
+        join_unit_indexers = indexers.copy()
+
         shape_list = list(mgr_shape)
         shape_list[0] = len(placements)
         shape = tuple(shape_list)
@@ -380,7 +356,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
             # Assertions disabled for performance
             # assert blk._mgr_locs.as_slice == placements.as_slice
             # assert blk.shape[0] == shape[0]
-        unit = JoinUnit(blk, shape)
+        unit = JoinUnit(blk, shape, join_unit_indexers, needs_filling=needs_filling)
 
         plan.append((placements, unit))
 
@@ -388,13 +364,22 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
 
 
 class JoinUnit:
-    def __init__(self, block: Block, shape: Shape):
+    def __init__(
+        self, block: Block, shape: Shape, indexers=None, *, needs_filling: bool = False
+    ):
         # Passing shape explicitly is required for cases when block is None.
+        # Note: block is None implies indexers is None, but not vice-versa
+        if indexers is None:
+            indexers = {}
+        # we should *never* have `0 in indexers`
         self.block = block
+        self.indexers = indexers
         self.shape = shape
 
+        self.needs_filling = needs_filling
+
     def __repr__(self) -> str:
-        return f"{type(self).__name__}({repr(self.block)})"
+        return f"{type(self).__name__}({repr(self.block)}, {self.indexers})"
 
     @cache_readonly
     def is_na(self) -> bool:
@@ -411,7 +396,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
 
         else:
 
-            if not self.block._can_consolidate:
+            if (not self.indexers) and (not self.block._can_consolidate):
                 # preserve these for validation in concat_compat
                 return self.block.values
 
@@ -419,6 +404,16 @@ def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
             # concatenation itself.
             values = self.block.values
 
+            if not self.indexers:
+                # If there's no indexing to be done, we want to signal outside
+                # code that this array must be copied explicitly. This is done
+                # by returning a view and checking `retval.base`.
+                values = values.view()
+
+            else:
+                for ax, indexer in self.indexers.items():
+                    values = algos.take_nd(values, indexer, axis=ax)
+
             return values
 
 
 def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
@@ -456,10 +451,15 @@ def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
     return missing_arr
 
 
-def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
+def _concatenate_join_units(
+    join_units: list[JoinUnit], concat_axis: int, copy: bool
+) -> ArrayLike:
     """
-    Concatenate values from several join units along axis=1.
+    Concatenate values from several join units along selected axis.
     """
+    if concat_axis == 0 and len(join_units) > 1:
+        # Concatenating join units along ax0 is handled in _merge_blocks.
+        raise AssertionError("Concatenating join units along axis0")
 
     empty_dtype = _get_empty_dtype(join_units)
@@ -495,7 +495,7 @@ def _concatenate_join_units(
         concat_values = ensure_block_shape(concat_values, 2)
 
     else:
-        concat_values = concat_compat(to_concat, axis=1)
+        concat_values = concat_compat(to_concat, axis=concat_axis)
 
     return concat_values
@@ -538,7 +538,7 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
         empty_dtype = join_units[0].block.dtype
         return empty_dtype
 
-    needs_can_hold_na = any(unit.is_na for unit in join_units)
+    needs_can_hold_na = any(unit.is_na or unit.needs_filling for unit in join_units)
 
     dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
@@ -575,6 +575,9 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
         # unless we're an extension dtype.
         all(not ju.is_na or ju.block.is_extension for ju in join_units)
         and
+        # no blocks with indexers (as then the dimensions do not fit)
+        all(not ju.indexers for ju in join_units)
+        and
         # only use this path when there is something to concatenate
         len(join_units) > 1
     )
@@ -594,6 +597,8 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
 
     Extra items that didn't fit are returned as a separate block.
     """
+    assert 0 not in join_unit.indexers
+    extra_indexers = join_unit.indexers
 
     extra_block = join_unit.block.getitem_block(slice(length, None))
     join_unit.block = join_unit.block.getitem_block(slice(length))
@@ -601,10 +606,16 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
     extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
     join_unit.shape = (length,) + join_unit.shape[1:]
 
-    return JoinUnit(block=extra_block, shape=extra_shape)
+    # extra_indexers does not introduce any -1s, so we can inherit needs_filling
+    return JoinUnit(
+        block=extra_block,
+        indexers=extra_indexers,
+        shape=extra_shape,
+        needs_filling=join_unit.needs_filling,
+    )
 
 
-def _combine_concat_plans(plans):
+def _combine_concat_plans(plans, concat_axis: int):
     """
     Combine multiple concatenation plans into one.
 
@@ -614,6 +625,18 @@ def _combine_concat_plans(plans, concat_axis: int):
         for p in plans[0]:
             yield p[0], [p[1]]
 
+    elif concat_axis == 0:
+        offset = 0
+        for plan in plans:
+            last_plc = None
+
+            for plc, unit in plan:
+                yield plc.add(offset), [unit]
+                last_plc = plc
+
+            if last_plc is not None:
+                offset += last_plc.as_slice.stop
+
     else:
         # singleton list so we can modify it as a side-effect within _next_or_none
         num_ended = [0]

From e785ff6d4da5f14c4637e6cd3851f1f5f751a3c0 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 15 Jun 2022 20:06:13 +0200
Subject: [PATCH 03/12] Revert "REF: pre-compute JoinUnit.needs_filling
 (#43590)"

This reverts commit 7036de35378d9db6236de2d70fe5f104b0bcdc9c.
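
For context, ``needs_filling`` goes back to being a lazily computed
property that scans each stored indexer for -1 entries. A minimal
standalone sketch of that check (the array values below are
illustrative only, not pandas internals):

    import numpy as np

    # A reindexing indexer marks positions missing from the source
    # with -1; those positions must be filled with NA after the take.
    indexer = np.array([0, 2, -1, 1])
    needs_filling = bool((indexer == -1).any())  # True

    values = np.array([10.0, 20.0, 30.0])
    # Take with clipped indices, then blank out the -1 slots.
    out = np.where(indexer == -1, np.nan, values[np.clip(indexer, 0, None)])
    # out -> array([10., 30., nan, 20.])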
---
 pandas/core/internals/concat.py | 55 ++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index f2b0823632c2f..0c85c1176d0a7 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -296,21 +296,9 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
 
     assert 0 not in indexers
 
-    needs_filling = False
-    if 1 in indexers:
-        # indexers[1] is shared by all the JoinUnits, so we can save time
-        # by only doing this check once
-        if (indexers[1] == -1).any():
-            needs_filling = True
-
     if mgr.is_single_block:
         blk = mgr.blocks[0]
-        return [
-            (
-                blk.mgr_locs,
-                JoinUnit(blk, mgr_shape, indexers, needs_filling=needs_filling),
-            )
-        ]
+        return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
 
     blknos = mgr.blknos
     blklocs = mgr.blklocs
@@ -356,7 +344,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
             # Assertions disabled for performance
             # assert blk._mgr_locs.as_slice == placements.as_slice
             # assert blk.shape[0] == shape[0]
-        unit = JoinUnit(blk, shape, join_unit_indexers, needs_filling=needs_filling)
+        unit = JoinUnit(blk, shape, join_unit_indexers)
 
         plan.append((placements, unit))
 
@@ -364,9 +352,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
 
 
 class JoinUnit:
-    def __init__(
-        self, block: Block, shape: Shape, indexers=None, *, needs_filling: bool = False
-    ):
+    def __init__(self, block: Block, shape: Shape, indexers=None):
         # Passing shape explicitly is required for cases when block is None.
         # Note: block is None implies indexers is None, but not vice-versa
         if indexers is None:
             indexers = {}
@@ -376,11 +362,28 @@ def __init__(self, block: Block, shape: Shape, indexers=None):
         self.indexers = indexers
         self.shape = shape
 
-        self.needs_filling = needs_filling
-
     def __repr__(self) -> str:
         return f"{type(self).__name__}({repr(self.block)}, {self.indexers})"
 
+    @cache_readonly
+    def needs_filling(self) -> bool:
+        for indexer in self.indexers.values():
+            # FIXME: cache results of indexer == -1 checks.
+            if (indexer == -1).any():
+                return True
+
+        return False
+
+    @cache_readonly
+    def dtype(self):
+        blk = self.block
+        if blk.values.dtype.kind == "V":
+            raise AssertionError("Block is None, no dtype")
+
+        if not self.needs_filling:
+            return blk.dtype
+        return ensure_dtype_can_hold_na(blk.dtype)
+
     @cache_readonly
     def is_na(self) -> bool:
         blk = self.block
@@ -538,12 +541,12 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
         empty_dtype = join_units[0].block.dtype
         return empty_dtype
 
-    needs_can_hold_na = any(unit.is_na or unit.needs_filling for unit in join_units)
+    has_none_blocks = any(unit.is_na for unit in join_units)
 
-    dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
+    dtypes = [unit.dtype for unit in join_units if not unit.is_na]
 
     dtype = find_common_type(dtypes)
-    if needs_can_hold_na:
+    if has_none_blocks:
         dtype = ensure_dtype_can_hold_na(dtype)
 
     return dtype
@@ -606,13 +609,7 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
     extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
     join_unit.shape = (length,) + join_unit.shape[1:]
 
-    # extra_indexers does not introduce any -1s, so we can inherit needs_filling
-    return JoinUnit(
-        block=extra_block,
-        indexers=extra_indexers,
-        shape=extra_shape,
-        needs_filling=join_unit.needs_filling,
-    )
+    return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape)
 
 
 def _combine_concat_plans(plans, concat_axis: int):

From b0fe1f0b844ecda61f8a50259565c4aaee2144c6 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 15 Jun 2022 20:06:30 +0200
Subject: [PATCH 04/12] Revert "REF: implement make_na_array (#43606)"

This reverts commit 4bb4b5243cf1dd81879ea407aa5f7961dde4c686.
---
 pandas/core/internals/concat.py | 98 +++++++++++++++++----------------
 1 file changed, 52 insertions(+), 46 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 0c85c1176d0a7..01797c3366410 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -9,10 +9,7 @@
 
 import numpy as np
 
-from pandas._libs import (
-    NaT,
-    internals as libinternals,
-)
+from pandas._libs import internals as libinternals
 from pandas._typing import (
     ArrayLike,
     DtypeObj,
@@ -391,21 +388,59 @@ def is_na(self) -> bool:
                 return True
         return False
 
-    def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
+    def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
         values: ArrayLike
 
-        if self.is_na:
-            return make_na_array(empty_dtype, self.shape)
-
+        if upcasted_na is None and not self.is_na:
+            # No upcasting is necessary
+            fill_value = self.block.fill_value
+            values = self.block.get_values()
         else:
+            fill_value = upcasted_na
+
+            if self.is_na:
+
+                if is_datetime64tz_dtype(empty_dtype):
+                    i8values = np.full(self.shape, fill_value.value)
+                    return DatetimeArray(i8values, dtype=empty_dtype)
+
+                elif is_1d_only_ea_dtype(empty_dtype):
+                    empty_dtype = cast(ExtensionDtype, empty_dtype)
+                    cls = empty_dtype.construct_array_type()
+
+                    missing_arr = cls._from_sequence([], dtype=empty_dtype)
+                    ncols, nrows = self.shape
+                    assert ncols == 1, ncols
+                    empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
+                    return missing_arr.take(
+                        empty_arr, allow_fill=True, fill_value=fill_value
+                    )
+                elif isinstance(empty_dtype, ExtensionDtype):
+                    # TODO: no tests get here, a handful would if we disabled
+                    # the dt64tz special-case above (which is faster)
+                    cls = empty_dtype.construct_array_type()
+                    missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype)
+                    missing_arr[:] = fill_value
+                    return missing_arr
+                else:
+                    # NB: we should never get here with empty_dtype integer or bool;
+                    # if we did, the missing_arr.fill would cast to gibberish
+                    missing_arr = np.empty(self.shape, dtype=empty_dtype)
+                    missing_arr.fill(fill_value)
+                    return missing_arr
 
             if (not self.indexers) and (not self.block._can_consolidate):
                 # preserve these for validation in concat_compat
                 return self.block.values
 
-            # No dtype upcasting is done here, it will be performed during
-            # concatenation itself.
-            values = self.block.values
+            if self.block.is_bool:
+                # External code requested filling/upcasting, bool values must
+                # be upcasted to object to avoid being upcasted to numeric.
+                values = self.block.astype(np.object_).values
+            else:
+                # No dtype upcasting is done here, it will be performed during
+                # concatenation itself.
+                values = self.block.values
 
             if not self.indexers:
                 # If there's no indexing to be done, we want to signal outside
                 # code that this array must be copied explicitly. This is done
                 # by returning a view and checking `retval.base`.
                 values = values.view()
 
             else:
                 for ax, indexer in self.indexers.items():
                     values = algos.take_nd(values, indexer, axis=ax)
 
             return values
 
 
-def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
-    """
-    Construct an np.ndarray or ExtensionArray of the given dtype and shape
-    holding all-NA values.
-    """
-    if is_datetime64tz_dtype(dtype):
-        # NaT here is analogous to dtype.na_value below
-        i8values = np.full(shape, NaT.value)
-        return DatetimeArray(i8values, dtype=dtype)
-
-    elif is_1d_only_ea_dtype(dtype):
-        dtype = cast(ExtensionDtype, dtype)
-        cls = dtype.construct_array_type()
-
-        missing_arr = cls._from_sequence([], dtype=dtype)
-        nrows = shape[-1]
-        taker = -1 * np.ones((nrows,), dtype=np.intp)
-        return missing_arr.take(taker, allow_fill=True, fill_value=dtype.na_value)
-    elif isinstance(dtype, ExtensionDtype):
-        # TODO: no tests get here, a handful would if we disabled
-        # the dt64tz special-case above (which is faster)
-        cls = dtype.construct_array_type()
-        missing_arr = cls._empty(shape=shape, dtype=dtype)
-        missing_arr[:] = dtype.na_value
-        return missing_arr
-    else:
-        # NB: we should never get here with dtype integer or bool;
-        # if we did, the missing_arr.fill would cast to gibberish
-        missing_arr = np.empty(shape, dtype=dtype)
-        fill_value = _dtype_to_na_value(dtype)
-        missing_arr.fill(fill_value)
-        return missing_arr
-
-
 def _concatenate_join_units(
     join_units: list[JoinUnit], concat_axis: int, copy: bool
 ) -> ArrayLike:
@@ -466,7 +467,12 @@ def _concatenate_join_units(
 
     empty_dtype = _get_empty_dtype(join_units)
 
-    to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype) for ju in join_units]
+    upcasted_na = _dtype_to_na_value(empty_dtype)
+
+    to_concat = [
+        ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
+        for ju in join_units
+    ]
 
     if len(to_concat) == 1:
         # Only one block, nothing to concatenate.

From 88fb27704ddc85c37b24af25c33a9d39182953a9 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 15 Jun 2022 20:08:16 +0200
Subject: [PATCH 05/12] Revert "REF: avoid having 0 in JoinUnit.indexers
 (#43592)"

This reverts commit eb643d7dcc71b9d6b85f8a2585d7c99908c1f104.
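
Roughly, this restores JoinUnits that carry an axis-0 indexer and apply
it at concatenation time instead of pre-slicing the block. A simplified
sketch of what applying such an indexer amounts to, using plain NumPy
(the real code goes through ``algos.take_nd``, which additionally
handles -1 entries and NA filling):

    import numpy as np

    # 2D block values: axis 0 is the item (column) axis in block layout.
    block_values = np.arange(12.0).reshape(3, 4)

    # Stored under ``indexers[0]``: select and reorder items lazily.
    ax0_indexer = np.array([2, 0])
    reindexed = np.take(block_values, ax0_indexer, axis=0)
    # reindexed.shape -> (2, 4)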
---
 pandas/core/internals/concat.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 01797c3366410..128aee9eeafad 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import copy
 import itertools
 from typing import (
     TYPE_CHECKING,
@@ -332,15 +333,12 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
             )
         )
 
-        if not unit_no_ax0_reindexing:
-            # create block from subset of columns
-            # Note: Blocks with only 1 column will always have unit_no_ax0_reindexing,
-            #  so we will never get here with ExtensionBlock.
-            blk = blk.getitem_block(ax0_blk_indexer)
+        # Omit indexer if no item reindexing is required.
+        if unit_no_ax0_reindexing:
+            join_unit_indexers.pop(0, None)
+        else:
+            join_unit_indexers[0] = ax0_blk_indexer
 
-        # Assertions disabled for performance
-        # assert blk._mgr_locs.as_slice == placements.as_slice
-        # assert blk.shape[0] == shape[0]
         unit = JoinUnit(blk, shape, join_unit_indexers)
 
         plan.append((placements, unit))
@@ -354,7 +352,6 @@ def __init__(self, block: Block, shape: Shape, indexers=None):
         # Note: block is None implies indexers is None, but not vice-versa
         if indexers is None:
             indexers = {}
-        # we should *never* have `0 in indexers`
         self.block = block
         self.indexers = indexers
         self.shape = shape
@@ -606,11 +603,20 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
 
     Extra items that didn't fit are returned as a separate block.
     """
-    assert 0 not in join_unit.indexers
-    extra_indexers = join_unit.indexers
+    if 0 not in join_unit.indexers:
+        extra_indexers = join_unit.indexers
+
+        if join_unit.block is None:
+            extra_block = None
+        else:
+            extra_block = join_unit.block.getitem_block(slice(length, None))
+            join_unit.block = join_unit.block.getitem_block(slice(length))
+    else:
+        extra_block = join_unit.block
 
-    extra_block = join_unit.block.getitem_block(slice(length, None))
-    join_unit.block = join_unit.block.getitem_block(slice(length))
+        extra_indexers = copy.copy(join_unit.indexers)
+        extra_indexers[0] = extra_indexers[0][length:]
+        join_unit.indexers[0] = join_unit.indexers[0][:length]
 
     extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
     join_unit.shape = (length,) + join_unit.shape[1:]

From 978c1eb3a9fe26c7aea4b0cc0e6918952ae73fb1 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 15 Jun 2022 20:08:40 +0200
Subject: [PATCH 06/12] Revert "CLN: remove unused concat code (#43577)"

This reverts commit 95eb15378462e3c7b731e4357e5e165cdbb58c98.
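
Among other helpers, this restores ``isna_all`` in
pandas/core/dtypes/missing.py. The idea behind it, sketched here for
the float case only (a hypothetical helper name; the real function
also dispatches on datetime-like, period, and object dtypes):

    import numpy as np

    def isna_all_float_sketch(arr: np.ndarray, chunk_len: int = 1000) -> bool:
        # Scan in chunks so that a non-NA value early in the array lets
        # us bail out without computing isna() for the whole thing.
        for i in range(0, len(arr), chunk_len):
            if not np.isnan(arr[i : i + chunk_len]).all():
                return False
        return True

    isna_all_float_sketch(np.full(10_000, np.nan))  # True
    isna_all_float_sketch(np.arange(10_000.0))      # False, stops early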
---
 pandas/core/dtypes/missing.py   | 38 ++++++++++++++++++++
 pandas/core/internals/concat.py | 61 +++++++++++++++++++++++++++++----
 2 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 4316109da1cbb..37b42ad66c027 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -18,6 +18,7 @@
 import pandas._libs.missing as libmissing
 from pandas._libs.tslibs import (
     NaT,
+    Period,
     iNaT,
 )
@@ -739,3 +740,40 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
 
     # fallback, default to allowing NaN, None, NA, NaT
     return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
+
+
+def isna_all(arr: ArrayLike) -> bool:
+    """
+    Optimized equivalent to isna(arr).all()
+    """
+    total_len = len(arr)
+
+    # Usually it's enough to check only a small fraction of values to see if
+    # a block is NOT null, chunks should help in such cases.
+    # parameters 1000 and 40 were chosen arbitrarily
+    chunk_len = max(total_len // 40, 1000)
+
+    dtype = arr.dtype
+    if dtype.kind == "f":
+        checker = nan_checker
+
+    elif dtype.kind in ["m", "M"] or dtype.type is Period:
+        # error: Incompatible types in assignment (expression has type
+        # "Callable[[Any], Any]", variable has type "ufunc")
+        checker = lambda x: np.asarray(x.view("i8")) == iNaT  # type: ignore[assignment]
+
+    else:
+        # error: Incompatible types in assignment (expression has type "Callable[[Any],
+        # Any]", variable has type "ufunc")
+        checker = lambda x: _isna_array(  # type: ignore[assignment]
+            x, inf_as_na=INF_AS_NA
+        )
+
+    return all(
+        # error: Argument 1 to "__call__" of "ufunc" has incompatible type
+        # "Union[ExtensionArray, Any]"; expected "Union[Union[int, float, complex, str,
+        # bytes, generic], Sequence[Union[int, float, complex, str, bytes, generic]],
+        # Sequence[Sequence[Any]], _SupportsArray]"
+        checker(arr[i : i + chunk_len]).all()  # type: ignore[arg-type]
+        for i in range(0, total_len, chunk_len)
+    )
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 128aee9eeafad..a7a6e8248e714 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -10,7 +10,11 @@
 
 import numpy as np
 
-from pandas._libs import internals as libinternals
+from pandas._libs import (
+    NaT,
+    internals as libinternals,
+)
+from pandas._libs.missing import NA
 from pandas._typing import (
     ArrayLike,
     DtypeObj,
@@ -27,12 +31,14 @@
     is_1d_only_ea_dtype,
     is_datetime64tz_dtype,
     is_dtype_equal,
+    needs_i8_conversion,
 )
 from pandas.core.dtypes.concat import (
     cast_to_common_type,
     concat_compat,
 )
 from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.missing import is_valid_na_for_dtype
 
 import pandas.core.algorithms as algos
 from pandas.core.arrays import (
@@ -378,6 +384,36 @@ def dtype(self):
         if not self.needs_filling:
             return blk.dtype
         return ensure_dtype_can_hold_na(blk.dtype)
 
+    def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
+        """
+        Check that we are all-NA of a type/dtype that is compatible with this dtype.
+        Augments `self.is_na` with an additional check of the type of NA values.
+        """
+        if not self.is_na:
+            return False
+        if self.block.dtype.kind == "V":
+            return True
+
+        if self.dtype == object:
+            values = self.block.values
+            return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
+
+        na_value = self.block.fill_value
+        if na_value is NaT and not is_dtype_equal(self.dtype, dtype):
+            # e.g. we are dt64 and other is td64
+            # fill_values match but we should not cast self.block.values to dtype
+            # TODO: this will need updating if we ever have non-nano dt64/td64
+            return False
+
+        if na_value is NA and needs_i8_conversion(dtype):
+            # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
+            # e.g. self.dtype == "Int64" and dtype is td64, we dont want
+            # to consider these as matching
+            return False
+
+        # TODO: better to use can_hold_element?
+        return is_valid_na_for_dtype(na_value, dtype)
+
     @cache_readonly
     def is_na(self) -> bool:
         blk = self.block
@@ -388,14 +424,24 @@ def is_na(self) -> bool:
     def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
         values: ArrayLike
 
-        if upcasted_na is None and not self.is_na:
+        if upcasted_na is None and self.block.dtype.kind != "V":
             # No upcasting is necessary
             fill_value = self.block.fill_value
             values = self.block.get_values()
         else:
             fill_value = upcasted_na
 
-            if self.is_na:
+            if self._is_valid_na_for(empty_dtype):
+                # note: always holds when self.block.dtype.kind == "V"
+                blk_dtype = self.block.dtype
+
+                if blk_dtype == np.dtype("object"):
+                    # we want to avoid filling with np.nan if we are
+                    # using None; we already know that we are all
+                    # nulls
+                    values = self.block.values.ravel(order="K")
+                    if len(values) and values[0] is None:
+                        fill_value = None
 
                 if is_datetime64tz_dtype(empty_dtype):
                     i8values = np.full(self.shape, fill_value.value)
@@ -464,7 +510,8 @@ def _concatenate_join_units(
 
     empty_dtype = _get_empty_dtype(join_units)
 
-    upcasted_na = _dtype_to_na_value(empty_dtype)
+    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
+    upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
 
     to_concat = [
         ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
         for ju in join_units
     ]
@@ -506,7 +553,7 @@ def _concatenate_join_units(
     return concat_values
 
 
-def _dtype_to_na_value(dtype: DtypeObj):
+def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
     """
     Find the NA value to go with this dtype.
     """
@@ -544,9 +591,11 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
         empty_dtype = join_units[0].block.dtype
         return empty_dtype
 
-    has_none_blocks = any(unit.is_na for unit in join_units)
+    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
 
     dtypes = [unit.dtype for unit in join_units if not unit.is_na]
+    if not len(dtypes):
+        dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"]
 
     dtype = find_common_type(dtypes)
     if has_none_blocks:
         dtype = ensure_dtype_can_hold_na(dtype)
 
     return dtype

From cf095e154fc2273f5388844c1753f4f74de4cce8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 15 Jun 2022 20:22:19 +0200
Subject: [PATCH 07/12] Partial Revert "BUG/API: concat with empty DataFrames
 or all-NA columns (#43507)"

This reverts commit 084c543bf9e70ed4f2ce1d4115b9959f7ae0c396.
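
At the user level, the restored behaviour means an empty or all-NA
float column no longer forces an object result dtype. A usage-level
illustration (default BlockManager; as the test changes below note,
ArrayManager keeps the 1.4.0 casting):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"bar": pd.to_datetime(["2013-01-01", "2013-01-02"])})
    empty = pd.DataFrame({"bar": [np.nan, np.nan]})  # all-NA float column

    res = pd.concat([empty, df], ignore_index=True)
    # The all-NA float column is ignored when choosing the result dtype,
    # so res["bar"] is datetime64[ns] with NaT in the first rows, instead
    # of the object dtype that 1.4.0 produced.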
---
 pandas/core/internals/concat.py           | 35 ++++++++++++++++++++--
 pandas/tests/frame/methods/test_append.py | 15 +++++++---
 pandas/tests/reshape/merge/test_merge.py  |  8 ++++--
 3 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index a7a6e8248e714..c9e26bdd9fd61 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -31,6 +31,7 @@
     is_1d_only_ea_dtype,
     is_datetime64tz_dtype,
     is_dtype_equal,
+    is_scalar,
     needs_i8_conversion,
 )
 from pandas.core.dtypes.concat import (
@@ -38,13 +39,18 @@
     concat_compat,
 )
 from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.missing import is_valid_na_for_dtype
+from pandas.core.dtypes.missing import (
+    is_valid_na_for_dtype,
+    isna,
+    isna_all,
+)
 
 import pandas.core.algorithms as algos
 from pandas.core.arrays import (
     DatetimeArray,
     ExtensionArray,
 )
+from pandas.core.arrays.sparse import SparseDtype
 from pandas.core.construction import ensure_wrapped_if_datetimelike
 from pandas.core.internals.array_manager import (
     ArrayManager,
@@ -419,7 +425,29 @@ def is_na(self) -> bool:
         blk = self.block
         if blk.dtype.kind == "V":
             return True
-        return False
+
+        if not blk._can_hold_na:
+            return False
+
+        values = blk.values
+        if values.size == 0:
+            return True
+        if isinstance(values.dtype, SparseDtype):
+            return False
+
+        if values.ndim == 1:
+            # TODO(EA2D): no need for special case with 2D EAs
+            val = values[0]
+            if not is_scalar(val) or not isna(val):
+                # ideally isna_all would do this short-circuiting
+                return False
+            return isna_all(values)
+        else:
+            val = values[0][0]
+            if not is_scalar(val) or not isna(val):
+                # ideally isna_all would do this short-circuiting
+                return False
+            return all(isna_all(row) for row in values)
 
     def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
         values: ArrayLike
@@ -567,6 +595,9 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
         # different from missing.na_value_for_dtype
         return None
     elif dtype.kind in ["i", "u"]:
+        if not has_none_blocks:
+            # different from missing.na_value_for_dtype
+            return None
         return np.nan
     elif dtype.kind == "O":
         return np.nan
diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py
index d1c9c379759b5..f07ffee20a55f 100644
--- a/pandas/tests/frame/methods/test_append.py
+++ b/pandas/tests/frame/methods/test_append.py
@@ -159,7 +159,7 @@ def test_append_empty_dataframe(self):
         expected = df1.copy()
         tm.assert_frame_equal(result, expected)
 
-    def test_append_dtypes(self):
+    def test_append_dtypes(self, using_array_manager):
 
         # GH 5754
         # row appends of different dtypes (so need to do by-item)
@@ -183,7 +183,10 @@ def test_append_dtypes(self):
         expected = DataFrame(
             {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
         )
-        expected = expected.astype(object)
+        if using_array_manager:
+            # TODO(ArrayManager) decide on exact casting rules in concat
+            # With ArrayManager, all-NaN float is not ignored
+            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
@@ -192,7 +195,9 @@ def test_append_dtypes(self):
         expected = DataFrame(
             {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
         )
-        expected = expected.astype(object)
+        if using_array_manager:
+            # With ArrayManager, all-NaN float is not ignored
+            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": np.nan}, index=range(1))
@@ -201,7 +206,9 @@ def test_append_dtypes(self):
         expected = DataFrame(
             {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
         )
-        expected = expected.astype(object)
+        if using_array_manager:
+            # With ArrayManager, all-NaN float is not ignored
+            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
         df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index ccdfc3cd23790..116fb298df61d 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -682,7 +682,7 @@ def _constructor(self):
 
         assert isinstance(result, NotADataFrame)
 
-    def test_join_append_timedeltas(self):
+    def test_join_append_timedeltas(self, using_array_manager):
         # timedelta64 issues with join/merge
         # GH 5695
 
@@ -696,9 +696,11 @@ def test_join_append_timedeltas(self):
             {
                 "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
                 "t": [timedelta(0, 22500), timedelta(0, 22500)],
-            },
-            dtype=object,
+            }
         )
+        if using_array_manager:
+            # TODO(ArrayManager) decide on exact casting rules in concat
+            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
     def test_join_append_timedeltas2(self):

From f02bdb1860a4f97387430909d9a46a9172b03bb1 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 15 Jun 2022 20:47:17 +0200
Subject: [PATCH 08/12] add test

---
 pandas/tests/reshape/concat/test_concat.py | 34 ++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index eb44b4889afb8..9fb9a013101b2 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -755,3 +755,37 @@ def test_concat_retain_attrs(data):
     df2.attrs = {1: 1}
     df = concat([df1, df2])
     assert df.attrs[1] == 1
+
+
+@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
+@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
+def test_concat_ignore_emtpy_object_float(empty_dtype, df_dtype):
+    # https://github.com/pandas-dev/pandas/issues/45637
+    df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
+    empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)
+    result = concat([empty, df])
+    expected = df
+    if df_dtype == "int64":
+        # TODO what exact behaviour do we want for integer eventually?
+        if empty_dtype == "float64":
+            expected = df.astype("float64")
+        else:
+            expected = df.astype("object")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
+@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
+def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
+    df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
+    empty = DataFrame({"foo": [np.nan], "bar": [np.nan]}, dtype=empty_dtype)
+    result = concat([empty, df], ignore_index=True)
+
+    if df_dtype == "int64":
+        # TODO what exact behaviour do we want for integer eventually?
+ if empty_dtype == "object": + df_dtype = "object" + else: + df_dtype = "float64" + expected = DataFrame({"foo": [None, 1, 2], "bar": [None, 1, 2]}, dtype=df_dtype) + tm.assert_frame_equal(result, expected) From 170931b0e6f50127cc20c25574e30c385cf8b2c5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 15 Jun 2022 23:44:10 +0200 Subject: [PATCH 09/12] skip new tests for array manager --- pandas/tests/reshape/concat/test_concat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 9fb9a013101b2..b91956232d59f 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -15,6 +15,7 @@ InvalidIndexError, PerformanceWarning, ) +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -757,6 +758,7 @@ def test_concat_retain_attrs(data): assert df.attrs[1] == 1 +@td.skip_array_manager_invalid_test @pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"]) @pytest.mark.parametrize("empty_dtype", [None, "float64", "object"]) def test_concat_ignore_emtpy_object_float(empty_dtype, df_dtype): @@ -774,6 +776,7 @@ def test_concat_ignore_emtpy_object_float(empty_dtype, df_dtype): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_invalid_test @pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"]) @pytest.mark.parametrize("empty_dtype", [None, "float64", "object"]) def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype): From 5ed7dad15228ae556f87354a5b1f706d4c34df53 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 15 Jun 2022 23:46:01 +0200 Subject: [PATCH 10/12] fix typing --- pandas/core/internals/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index c9e26bdd9fd61..4a352d614e1d9 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -507,7 +507,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if self.block.is_bool: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. - values = self.block.astype(np.object_).values + values = self.block.astype(np.dtype("object")).values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. 
From 1c718bbc7f34769f6fd3210054a603d39e91981a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 17 Jun 2022 15:40:53 +0200
Subject: [PATCH 11/12] add more tests

---
 pandas/tests/extension/base/setitem.py     | 14 ++++++++++++++
 pandas/tests/reshape/concat/test_concat.py | 10 ++++++++++
 2 files changed, 24 insertions(+)

diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
index 9e016e0101ef6..04fa3c11a6c40 100644
--- a/pandas/tests/extension/base/setitem.py
+++ b/pandas/tests/extension/base/setitem.py
@@ -357,6 +357,20 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
 
         self.assert_frame_equal(result, expected)
 
+    def test_setitem_with_expansion_row(self, data, na_value):
+        df = pd.DataFrame({"data": data[:1]})
+
+        df.loc[1, "data"] = data[1]
+        expected = pd.DataFrame({"data": data[:2]})
+        self.assert_frame_equal(df, expected)
+
+        # https://github.com/pandas-dev/pandas/issues/47284
+        df.loc[2, "data"] = na_value
+        expected = pd.DataFrame(
+            {"data": pd.Series([data[0], data[1], na_value], dtype=data.dtype)}
+        )
+        self.assert_frame_equal(df, expected)
+
     def test_setitem_series(self, data, full_indexer):
         # https://github.com/pandas-dev/pandas/issues/32395
         ser = pd.Series(data, name="data")
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 0819da0a23d5e..f8b93ac367c87 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -792,3 +792,13 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
             df_dtype = "float64"
     expected = DataFrame({"foo": [None, 1, 2], "bar": [None, 1, 2]}, dtype=df_dtype)
     tm.assert_frame_equal(result, expected)
+
+
+def test_concat_ignore_empty_from_reindex():
+    # https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856
+    df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]})
+    df2 = DataFrame({"a": [2]})
+
+    result = concat([df1, df2.reindex(columns=df1.columns)], ignore_index=True)
+    expected = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
+    tm.assert_frame_equal(result, expected)

From 421ec5dc5c53711218cc0362e428411c7b944069 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 17 Jun 2022 18:40:08 +0200
Subject: [PATCH 12/12] add whatsnew

---
 doc/source/whatsnew/v1.4.0.rst             | 13 +++++++++++--
 doc/source/whatsnew/v1.4.3.rst             | 11 +++++++++++
 pandas/tests/reshape/concat/test_concat.py |  1 +
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 52aa9312d4c14..697070e50a40a 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -271,6 +271,9 @@ the given ``dayfirst`` value when the value is a delimited date string (e.g.
 Ignoring dtypes in concat with empty or all-NA columns
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+.. note::
+    This behaviour change has been reverted in pandas 1.4.3.
+
 When using :func:`concat` to concatenate two or more :class:`DataFrame`
 objects, if one of the DataFrames was empty or had all-NA values, its dtype
 was *sometimes* ignored when finding the concatenated dtype. These are now
@@ -301,9 +304,15 @@ object, the ``np.nan`` is retained.
 
 *New behavior*:
 
-.. ipython:: python
+.. code-block:: ipython
+
+    In [4]: res
+    Out[4]:
+                       bar
+    0  2013-01-01 00:00:00
+    1                  NaN
+
 
-    res
 
 .. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerce_to_nan:
diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst
index a4d81533df23d..0f740f845119a 100644
--- a/doc/source/whatsnew/v1.4.3.rst
+++ b/doc/source/whatsnew/v1.4.3.rst
@@ -10,6 +10,17 @@ including other versions of pandas.
 
 .. ---------------------------------------------------------------------------
 
+.. _whatsnew_143.concat:
+
+Behaviour of ``concat`` with empty or all-NA DataFrame columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The behaviour change in version 1.4.0 to stop ignoring the data type
+of empty or all-NA columns with float or object dtype in :func:`concat`
+(:ref:`whatsnew_140.notable_bug_fixes.concat_with_empty_or_all_na`) has been
+reverted (:issue:`45637`).
+
+
 .. _whatsnew_143.regressions:
 
 Fixed regressions
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index f8b93ac367c87..4ba231523af14 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -794,6 +794,7 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
     tm.assert_frame_equal(result, expected)
 
 
+@td.skip_array_manager_invalid_test
 def test_concat_ignore_empty_from_reindex():
     # https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856
     df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]})