Skip to content

Commit 4bb4b52

Browse files
authored
REF: implement make_na_array (pandas-dev#43606)
1 parent 604fffb commit 4bb4b52

File tree

1 file changed

+46
-52
lines changed

1 file changed

+46
-52
lines changed

pandas/core/internals/concat.py

+46-52
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@
99

1010
import numpy as np
1111

12-
from pandas._libs import internals as libinternals
12+
from pandas._libs import (
13+
NaT,
14+
internals as libinternals,
15+
)
1316
from pandas._typing import (
1417
ArrayLike,
1518
DtypeObj,
@@ -383,59 +386,21 @@ def is_na(self) -> bool:
383386
return True
384387
return False
385388

386-
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
389+
def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
387390
values: ArrayLike
388391

389-
if upcasted_na is None and not self.is_na:
390-
# No upcasting is necessary
391-
fill_value = self.block.fill_value
392-
values = self.block.get_values()
392+
if self.is_na:
393+
return make_na_array(empty_dtype, self.shape)
394+
393395
else:
394-
fill_value = upcasted_na
395-
396-
if self.is_na:
397-
398-
if is_datetime64tz_dtype(empty_dtype):
399-
i8values = np.full(self.shape, fill_value.value)
400-
return DatetimeArray(i8values, dtype=empty_dtype)
401-
402-
elif is_1d_only_ea_dtype(empty_dtype):
403-
empty_dtype = cast(ExtensionDtype, empty_dtype)
404-
cls = empty_dtype.construct_array_type()
405-
406-
missing_arr = cls._from_sequence([], dtype=empty_dtype)
407-
ncols, nrows = self.shape
408-
assert ncols == 1, ncols
409-
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
410-
return missing_arr.take(
411-
empty_arr, allow_fill=True, fill_value=fill_value
412-
)
413-
elif isinstance(empty_dtype, ExtensionDtype):
414-
# TODO: no tests get here, a handful would if we disabled
415-
# the dt64tz special-case above (which is faster)
416-
cls = empty_dtype.construct_array_type()
417-
missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype)
418-
missing_arr[:] = fill_value
419-
return missing_arr
420-
else:
421-
# NB: we should never get here with empty_dtype integer or bool;
422-
# if we did, the missing_arr.fill would cast to gibberish
423-
missing_arr = np.empty(self.shape, dtype=empty_dtype)
424-
missing_arr.fill(fill_value)
425-
return missing_arr
426396

427397
if (not self.indexers) and (not self.block._can_consolidate):
428398
# preserve these for validation in concat_compat
429399
return self.block.values
430400

431-
if self.block.is_bool:
432-
# External code requested filling/upcasting, bool values must
433-
# be upcasted to object to avoid being upcasted to numeric.
434-
values = self.block.astype(np.object_).values
435-
else:
436-
# No dtype upcasting is done here, it will be performed during
437-
# concatenation itself.
438-
values = self.block.values
401+
# No dtype upcasting is done here, it will be performed during
402+
# concatenation itself.
403+
values = self.block.values
439404

440405
if not self.indexers:
441406
# If there's no indexing to be done, we want to signal outside
@@ -450,6 +415,40 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
450415
return values
451416

452417

418+
def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
    """
    Build an all-NA np.ndarray or ExtensionArray of the requested dtype
    and shape.

    Parameters
    ----------
    dtype : DtypeObj
        Dtype of the array to construct.
    shape : Shape
        Shape of the array to construct. For dtypes accepted by
        ``is_1d_only_ea_dtype`` only the last entry is used, as the length
        of the returned array.

    Returns
    -------
    ArrayLike
        Array filled with the NA value chosen for ``dtype``: ``NaT`` for
        tz-aware datetimes, ``dtype.na_value`` for other extension dtypes,
        and ``_dtype_to_na_value(dtype)`` for everything else.
    """
    if is_datetime64tz_dtype(dtype):
        # NaT here is analogous to dtype.na_value in the branches below
        nat_i8 = np.full(shape, NaT.value)
        return DatetimeArray(nat_i8, dtype=dtype)

    if is_1d_only_ea_dtype(dtype):
        ea_dtype = cast(ExtensionDtype, dtype)
        array_type = ea_dtype.construct_array_type()

        # Start from an empty array and expand it with take(): every
        # indexer entry is -1 and allow_fill=True, so each slot is
        # requested as fill_value (see ExtensionArray.take).
        empty = array_type._from_sequence([], dtype=ea_dtype)
        length = shape[-1]
        all_missing = np.full((length,), -1, dtype=np.intp)
        return empty.take(all_missing, allow_fill=True, fill_value=ea_dtype.na_value)

    if isinstance(dtype, ExtensionDtype):
        # TODO: no tests get here, a handful would if we disabled
        #  the dt64tz special-case above (which is faster)
        array_type = dtype.construct_array_type()
        result = array_type._empty(shape=shape, dtype=dtype)
        result[:] = dtype.na_value
        return result

    # NB: we should never get here with dtype integer or bool;
    #  if we did, the fill below would cast to gibberish
    result = np.empty(shape, dtype=dtype)
    result.fill(_dtype_to_na_value(dtype))
    return result
450+
451+
453452
def _concatenate_join_units(
454453
join_units: list[JoinUnit], concat_axis: int, copy: bool
455454
) -> ArrayLike:
@@ -462,12 +461,7 @@ def _concatenate_join_units(
462461

463462
empty_dtype = _get_empty_dtype(join_units)
464463

465-
upcasted_na = _dtype_to_na_value(empty_dtype)
466-
467-
to_concat = [
468-
ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
469-
for ju in join_units
470-
]
464+
to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype) for ju in join_units]
471465

472466
if len(to_concat) == 1:
473467
# Only one block, nothing to concatenate.

0 commit comments

Comments
 (0)