From e7bfa92b90b354510ab80213cfb95c13096cf4df Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 3 Sep 2021 10:11:17 -0700 Subject: [PATCH 1/4] REF: remove no-longer reachable cases from internals.concat --- pandas/core/internals/concat.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 1360f1d1a508a..9f9c68d437e97 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import itertools from typing import ( TYPE_CHECKING, @@ -648,20 +647,15 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit: Extra items that didn't fit are returned as a separate block. """ - if 0 not in join_unit.indexers: - extra_indexers = join_unit.indexers + assert 0 not in join_unit.indexers - if join_unit.block is None: - extra_block = None - else: - extra_block = join_unit.block.getitem_block(slice(length, None)) - join_unit.block = join_unit.block.getitem_block(slice(length)) - else: - extra_block = join_unit.block + extra_indexers = join_unit.indexers - extra_indexers = copy.copy(join_unit.indexers) - extra_indexers[0] = extra_indexers[0][length:] - join_unit.indexers[0] = join_unit.indexers[0][:length] + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] join_unit.shape = (length,) + join_unit.shape[1:] From 91c310aef3632ccac15172e12658297eeb53a96d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 3 Sep 2021 16:25:05 -0700 Subject: [PATCH 2/4] fix incorrect assertion --- pandas/core/internals/concat.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 9f9c68d437e97..b2c5f6fbc308c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,5 +1,6 @@ from __future__ import annotations +import copy import itertools from typing import ( TYPE_CHECKING, @@ -647,9 +648,20 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit: Extra items that didn't fit are returned as a separate block. """ - assert 0 not in join_unit.indexers + if 0 not in join_unit.indexers: + extra_indexers = join_unit.indexers - extra_indexers = join_unit.indexers + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) + else: + extra_block = join_unit.block + + extra_indexers = copy.copy(join_unit.indexers) + extra_indexers[0] = extra_indexers[0][length:] + join_unit.indexers[0] = join_unit.indexers[0][:length] if join_unit.block is None: extra_block = None From 995f296a32b5db66ca738af1cda7d16f3e654c83 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Sep 2021 13:08:56 -0700 Subject: [PATCH 3/4] typo fixup --- pandas/core/internals/concat.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index b2c5f6fbc308c..1360f1d1a508a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -663,12 +663,6 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit: extra_indexers[0] = extra_indexers[0][length:] join_unit.indexers[0] = join_unit.indexers[0][:length] - if join_unit.block is None: - extra_block = None - else: - extra_block = join_unit.block.getitem_block(slice(length, None)) - join_unit.block = join_unit.block.getitem_block(slice(length)) - extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] join_unit.shape = (length,) + join_unit.shape[1:] From 423f46e739c3ed728e6a0d43bd2ecbcaae2f81da Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 14 Sep 2021 20:21:11 -0700 Subject: [PATCH 4/4] CLN: removed unused concat code --- pandas/core/dtypes/missing.py | 38 -------------------- pandas/core/internals/concat.py | 61 ++++----------------------------- 2 files changed, 6 insertions(+), 93 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 0e342c7855653..f5fbd4cc4a7fc 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -12,7 +12,6 @@ import pandas._libs.missing as libmissing from pandas._libs.tslibs import ( NaT, - Period, iNaT, ) from pandas._typing import ( @@ -644,40 +643,3 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: # fallback, default to allowing NaN, None, NA, NaT return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal)) - - -def isna_all(arr: ArrayLike) -> bool: - """ - Optimized equivalent to isna(arr).all() - """ - total_len = len(arr) - - # Usually it's enough to check but a small fraction of values to see if - # a block is NOT null, chunks should help in such cases. - # parameters 1000 and 40 were chosen arbitrarily - chunk_len = max(total_len // 40, 1000) - - dtype = arr.dtype - if dtype.kind == "f": - checker = nan_checker - - elif dtype.kind in ["m", "M"] or dtype.type is Period: - # error: Incompatible types in assignment (expression has type - # "Callable[[Any], Any]", variable has type "ufunc") - checker = lambda x: np.asarray(x.view("i8")) == iNaT # type: ignore[assignment] - - else: - # error: Incompatible types in assignment (expression has type "Callable[[Any], - # Any]", variable has type "ufunc") - checker = lambda x: _isna_array( # type: ignore[assignment] - x, inf_as_na=INF_AS_NA - ) - - return all( - # error: Argument 1 to "__call__" of "ufunc" has incompatible type - # "Union[ExtensionArray, Any]"; expected "Union[Union[int, float, complex, str, - # bytes, generic], Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - checker(arr[i : i + chunk_len]).all() # type: ignore[arg-type] - for i in range(0, total_len, chunk_len) - ) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 1360f1d1a508a..69b01f0b26be3 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -10,11 +10,7 @@ import numpy as np -from pandas._libs import ( - NaT, - internals as libinternals, -) -from pandas._libs.missing import NA +from pandas._libs import internals as libinternals from pandas._typing import ( ArrayLike, DtypeObj, @@ -32,14 +28,12 @@ is_1d_only_ea_obj, is_datetime64tz_dtype, is_dtype_equal, - needs_i8_conversion, ) from pandas.core.dtypes.concat import ( cast_to_common_type, concat_compat, ) from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.missing import is_valid_na_for_dtype import pandas.core.algorithms as algos from pandas.core.arrays import ( @@ -381,36 +375,6 @@ def dtype(self): return blk.dtype return ensure_dtype_can_hold_na(blk.dtype) - def _is_valid_na_for(self, dtype: DtypeObj) -> bool: - """ - Check that we are all-NA of a type/dtype that is compatible with this dtype. - Augments `self.is_na` with an additional check of the type of NA values. - """ - if not self.is_na: - return False - if self.block.dtype.kind == "V": - return True - - if self.dtype == object: - values = self.block.values - return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K")) - - na_value = self.block.fill_value - if na_value is NaT and not is_dtype_equal(self.dtype, dtype): - # e.g. we are dt64 and other is td64 - # fill_values match but we should not cast self.block.values to dtype - # TODO: this will need updating if we ever have non-nano dt64/td64 - return False - - if na_value is NA and needs_i8_conversion(dtype): - # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat - # e.g. self.dtype == "Int64" and dtype is td64, we dont want - # to consider these as matching - return False - - # TODO: better to use can_hold_element? - return is_valid_na_for_dtype(na_value, dtype) - @cache_readonly def is_na(self) -> bool: blk = self.block @@ -421,24 +385,14 @@ def is_na(self) -> bool: def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: values: ArrayLike - if upcasted_na is None and self.block.dtype.kind != "V": + if upcasted_na is None and not self.is_na: # No upcasting is necessary fill_value = self.block.fill_value values = self.block.get_values() else: fill_value = upcasted_na - if self._is_valid_na_for(empty_dtype): - # note: always holds when self.block.dtype.kind == "V" - blk_dtype = self.block.dtype - - if blk_dtype == np.dtype("object"): - # we want to avoid filling with np.nan if we are - # using None; we already know that we are all - # nulls - values = self.block.values.ravel(order="K") - if len(values) and values[0] is None: - fill_value = None + if self.is_na: if is_datetime64tz_dtype(empty_dtype): i8values = np.full(self.shape, fill_value.value) @@ -507,8 +461,7 @@ def _concatenate_join_units( empty_dtype = _get_empty_dtype(join_units) - has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) - upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) + upcasted_na = _dtype_to_na_value(empty_dtype) to_concat = [ ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) @@ -548,7 +501,7 @@ def _concatenate_join_units( return concat_values -def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): +def _dtype_to_na_value(dtype: DtypeObj): """ Find the NA value to go with this dtype. """ @@ -587,11 +540,9 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: empty_dtype = join_units[0].block.dtype return empty_dtype - has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) + has_none_blocks = any(unit.is_na for unit in join_units) dtypes = [unit.dtype for unit in join_units if not unit.is_na] - if not len(dtypes): - dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"] dtype = find_common_type(dtypes) if has_none_blocks: