From 0dcaadd52787b235ffd1455ea10ef8657c904ee5 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 15 Apr 2021 18:38:59 -0700 Subject: [PATCH] REF: move DataFrame-specific methods from NDFrame --- pandas/core/frame.py | 101 +++++++ pandas/core/generic.py | 94 +----- pandas/core/groupby/generic.py | 1 + pandas/core/internals/array_manager.py | 244 ++++++++-------- pandas/core/internals/managers.py | 346 +++++++++++------------ pandas/tests/internals/test_internals.py | 24 +- 6 files changed, 415 insertions(+), 395 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8ac94111cca56..4469869a5a929 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10454,6 +10454,107 @@ def _AXIS_NAMES(self) -> dict[int, str]: boxplot = pandas.plotting.boxplot_frame sparse = CachedAccessor("sparse", SparseFrameAccessor) + # ---------------------------------------------------------------------- + # Internal Interface Methods + + def _to_dict_of_blocks(self, copy: bool = True): + """ + Return a dict of dtype -> Constructor Types that + each is a homogeneous dtype. + + Internal ONLY - only works for BlockManager + """ + mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well + mgr = mgr_to_mgr(mgr, "block") + mgr = cast(BlockManager, mgr) + return { + k: self._constructor(v).__finalize__(self) + for k, v, in mgr.to_dict(copy=copy).items() + } + + @property + def values(self) -> np.ndarray: + """ + Return a Numpy representation of the DataFrame. + + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. + + Only the values in the DataFrame will be returned, the axes labels + will be removed. + + Returns + ------- + numpy.ndarray + The values of the DataFrame. + + See Also + -------- + DataFrame.to_numpy : Recommended alternative to this method. + DataFrame.index : Retrieve the index labels. + DataFrame.columns : Retrieving the column names. + + Notes + ----- + The dtype will be a lower-common-denominator dtype (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. Use this + with care if you are not dealing with the blocks. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. By :func:`numpy.find_common_type` convention, mixing int64 + and uint64 will result in a float64 dtype. + + Examples + -------- + A DataFrame where all columns are the same type (e.g., int64) results + in an array of the same type. + + >>> df = pd.DataFrame({'age': [ 3, 29], + ... 'height': [94, 170], + ... 'weight': [31, 115]}) + >>> df + age height weight + 0 3 94 31 + 1 29 170 115 + >>> df.dtypes + age int64 + height int64 + weight int64 + dtype: object + >>> df.values + array([[ 3, 94, 31], + [ 29, 170, 115]]) + + A DataFrame with mixed type columns(e.g., str/object, int64, float32) + results in an ndarray of the broadest type that accommodates these + mixed types (e.g., object). + + >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), + ... ('lion', 80.5, 1), + ... ('monkey', np.nan, None)], + ... 
columns=('name', 'max_speed', 'rank')) + >>> df2.dtypes + name object + max_speed float64 + rank object + dtype: object + >>> df2.values + array([['parrot', 24.0, 'second'], + ['lion', 80.5, 1], + ['monkey', nan, None]], dtype=object) + """ + self._consolidate_inplace() + return self._mgr.as_array(transpose=True) + + @property + def _values(self) -> np.ndarray: + """internal implementation""" + return self.values + DataFrame._add_numeric_operations() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eba4a36315ba4..35f29bdedf639 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5614,85 +5614,12 @@ def _get_bool_data(self): @property def values(self) -> np.ndarray: - """ - Return a Numpy representation of the DataFrame. - - .. warning:: - - We recommend using :meth:`DataFrame.to_numpy` instead. - - Only the values in the DataFrame will be returned, the axes labels - will be removed. - - Returns - ------- - numpy.ndarray - The values of the DataFrame. - - See Also - -------- - DataFrame.to_numpy : Recommended alternative to this method. - DataFrame.index : Retrieve the index labels. - DataFrame.columns : Retrieving the column names. - - Notes - ----- - The dtype will be a lower-common-denominator dtype (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. Use this - with care if you are not dealing with the blocks. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcast to - int32. By :func:`numpy.find_common_type` convention, mixing int64 - and uint64 will result in a float64 dtype. - - Examples - -------- - A DataFrame where all columns are the same type (e.g., int64) results - in an array of the same type. - - >>> df = pd.DataFrame({'age': [ 3, 29], - ... 'height': [94, 170], - ... 'weight': [31, 115]}) - >>> df - age height weight - 0 3 94 31 - 1 29 170 115 - >>> df.dtypes - age int64 - height int64 - weight int64 - dtype: object - >>> df.values - array([[ 3, 94, 31], - [ 29, 170, 115]]) - - A DataFrame with mixed type columns(e.g., str/object, int64, float32) - results in an ndarray of the broadest type that accommodates these - mixed types (e.g., object). - - >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), - ... ('lion', 80.5, 1), - ... ('monkey', np.nan, None)], - ... columns=('name', 'max_speed', 'rank')) - >>> df2.dtypes - name object - max_speed float64 - rank object - dtype: object - >>> df2.values - array([['parrot', 24.0, 'second'], - ['lion', 80.5, 1], - ['monkey', nan, None]], dtype=object) - """ - self._consolidate_inplace() - return self._mgr.as_array(transpose=self._AXIS_REVERSED) + raise AbstractMethodError(self) @property def _values(self) -> np.ndarray: """internal implementation""" - return self.values + raise AbstractMethodError(self) @property def dtypes(self): @@ -5725,23 +5652,6 @@ def dtypes(self): data = self._mgr.get_dtypes() return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) - @final - def _to_dict_of_blocks(self, copy: bool_t = True): - """ - Return a dict of dtype -> Constructor Types that - each is a homogeneous dtype. 
- - Internal ONLY - only works for BlockManager - """ - mgr = self._mgr - # convert to BlockManager if needed -> this way support ArrayManager as well - mgr = mgr_to_mgr(mgr, "block") - mgr = cast(BlockManager, mgr) - return { - k: self._constructor(v).__finalize__(self) - for k, v, in mgr.to_dict(copy=copy).items() - } - def astype( self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" ) -> FrameOrSeries: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1a8329d77f61e..4559b2d8e5e78 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1152,6 +1152,7 @@ def py_fallback(values: ArrayLike) -> ArrayLike: # We've split an object block! Everything we've assumed # about a single block input returning a single block output # is a lie. See eg GH-39329 + mgr = cast(Manager2D, mgr) return mgr.as_array() else: # We are a single block from a BlockManager diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a25750e7e1eab..ff76228646a02 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -201,47 +201,6 @@ def __repr__(self) -> str: output += f"\n{arr.dtype}" return output - def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: - """ - Apply grouped reduction function columnwise, returning a new ArrayManager. - - Parameters - ---------- - func : grouped reduction function - ignore_failures : bool, default False - Whether to drop columns where func raises TypeError. - - Returns - ------- - ArrayManager - """ - result_arrays: list[np.ndarray] = [] - result_indices: list[int] = [] - - for i, arr in enumerate(self.arrays): - try: - res = func(arr) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - continue - result_arrays.append(res) - result_indices.append(i) - - if len(result_arrays) == 0: - index = Index([None]) # placeholder - else: - index = Index(range(result_arrays[0].shape[0])) - - if ignore_failures: - columns = self.items[np.array(result_indices, dtype="int64")] - else: - columns = self.items - - # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; - # expected "List[Union[ndarray, ExtensionArray]]" - return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] - def apply( self: T, f, @@ -322,25 +281,6 @@ def apply( # expected "List[Union[ndarray, ExtensionArray]]" return type(self)(result_arrays, new_axes) # type: ignore[arg-type] - def apply_2d(self: T, f, ignore_failures: bool = False, **kwargs) -> T: - """ - Variant of `apply`, but where the function should not be applied to - each column independently, but to the full data as a 2D array. 
- """ - values = self.as_array() - try: - result = f(values, **kwargs) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - result_arrays = [] - new_axes = [self._axes[0], self.axes[1].take([])] - else: - result_arrays = [result[:, i] for i in range(len(self._axes[1]))] - new_axes = self._axes - - return type(self)(result_arrays, new_axes) - def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T: # switch axis to follow BlockManager logic if swap_axis and "axis" in kwargs and self.ndim == 2: @@ -606,67 +546,6 @@ def copy_func(ax): new_arrays = self.arrays return type(self)(new_arrays, new_axes) - def as_array( - self, - transpose: bool = False, - dtype=None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: - """ - Convert the blockmanager data into an numpy array. - - Parameters - ---------- - transpose : bool, default False - If True, transpose the return array. - dtype : object, default None - Data type of the return array. - copy : bool, default False - If True then guarantee that a copy is returned. A value of - False does not guarantee that the underlying data is not - copied. - na_value : object, default lib.no_default - Value to be used as the missing value sentinel. - - Returns - ------- - arr : ndarray - """ - if len(self.arrays) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() if transpose else arr - - # We want to copy when na_value is provided to avoid - # mutating the original object - copy = copy or na_value is not lib.no_default - - if not dtype: - dtype = interleaved_dtype([arr.dtype for arr in self.arrays]) - - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - elif isinstance(dtype, PandasDtype): - dtype = dtype.numpy_dtype - elif is_extension_array_dtype(dtype): - dtype = "object" - elif is_dtype_equal(dtype, str): - dtype = "object" - - result = np.empty(self.shape_proper, dtype=dtype) - - # error: Incompatible types in assignment (expression has type "Union[ndarray, - # ExtensionArray]", variable has type "ndarray") - for i, arr in enumerate(self.arrays): # type: ignore[assignment] - arr = arr.astype(dtype, copy=copy) - result[:, i] = arr - - if na_value is not lib.no_default: - result[isna(result)] = na_value - - return result - # return arr.transpose() if transpose else arr - def reindex_indexer( self: T, new_axis, @@ -1035,6 +914,47 @@ def idelete(self, indexer): # -------------------------------------------------------------------- # Array-wise Operation + def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + """ + Apply grouped reduction function columnwise, returning a new ArrayManager. + + Parameters + ---------- + func : grouped reduction function + ignore_failures : bool, default False + Whether to drop columns where func raises TypeError. 
+ + Returns + ------- + ArrayManager + """ + result_arrays: list[np.ndarray] = [] + result_indices: list[int] = [] + + for i, arr in enumerate(self.arrays): + try: + res = func(arr) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + result_arrays.append(res) + result_indices.append(i) + + if len(result_arrays) == 0: + index = Index([None]) # placeholder + else: + index = Index(range(result_arrays[0].shape[0])) + + if ignore_failures: + columns = self.items[np.array(result_indices, dtype="int64")] + else: + columns = self.items + + # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; + # expected "List[Union[ndarray, ExtensionArray]]" + return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] + def reduce( self: T, func: Callable, ignore_failures: bool = False ) -> tuple[T, np.ndarray]: @@ -1122,6 +1042,27 @@ def quantile( axes = [qs, self._axes[1]] return type(self)(new_arrs, axes) + def apply_2d( + self: ArrayManager, f, ignore_failures: bool = False, **kwargs + ) -> ArrayManager: + """ + Variant of `apply`, but where the function should not be applied to + each column independently, but to the full data as a 2D array. + """ + values = self.as_array() + try: + result = f(values, **kwargs) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + result_arrays = [] + new_axes = [self._axes[0], self.axes[1].take([])] + else: + result_arrays = [result[:, i] for i in range(len(self._axes[1]))] + new_axes = self._axes + + return type(self)(result_arrays, new_axes) + # ---------------------------------------------------------------- def unstack(self, unstacker, fill_value) -> ArrayManager: @@ -1166,6 +1107,67 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: return type(self)(new_arrays, new_axes, verify_integrity=False) + def as_array( + self, + transpose: bool = False, + dtype=None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : bool, default False + If True, transpose the return array. + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. 
+ + Returns + ------- + arr : ndarray + """ + if len(self.arrays) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if not dtype: + dtype = interleaved_dtype([arr.dtype for arr in self.arrays]) + + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + elif isinstance(dtype, PandasDtype): + dtype = dtype.numpy_dtype + elif is_extension_array_dtype(dtype): + dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" + + result = np.empty(self.shape_proper, dtype=dtype) + + # error: Incompatible types in assignment (expression has type "Union[ndarray, + # ExtensionArray]", variable has type "ndarray") + for i, arr in enumerate(self.arrays): # type: ignore[assignment] + arr = arr.astype(dtype, copy=copy) + result[:, i] = arr + + if na_value is not lib.no_default: + result[isna(result)] = na_value + + return result + # return arr.transpose() if transpose else arr + class SingleArrayManager(BaseArrayManager, SingleDataManager): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 487047f1a1dbb..cd230a29250d2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -285,41 +285,6 @@ def __repr__(self) -> str: output += f"\n{block}" return output - def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: - """ - Apply grouped reduction function blockwise, returning a new BlockManager. - - Parameters - ---------- - func : grouped reduction function - ignore_failures : bool, default False - Whether to drop blocks where func raises TypeError. - - Returns - ------- - BlockManager - """ - result_blocks: list[Block] = [] - - for blk in self.blocks: - try: - applied = blk.apply(func) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - continue - result_blocks = extend_blocks(applied, result_blocks) - - if len(result_blocks) == 0: - index = Index([None]) # placeholder - else: - index = Index(range(result_blocks[0].values.shape[-1])) - - if ignore_failures: - return self._combine(result_blocks, index=index) - - return type(self).from_blocks(result_blocks, [self.axes[0], index]) - def apply( self: T, f, @@ -620,144 +585,6 @@ def copy_func(ax): res.axes = new_axes return res - def as_array( - self, - transpose: bool = False, - dtype: Dtype | None = None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: - """ - Convert the blockmanager data into an numpy array. - - Parameters - ---------- - transpose : bool, default False - If True, transpose the return array. - dtype : object, default None - Data type of the return array. - copy : bool, default False - If True then guarantee that a copy is returned. A value of - False does not guarantee that the underlying data is not - copied. - na_value : object, default lib.no_default - Value to be used as the missing value sentinel. 
- - Returns - ------- - arr : ndarray - """ - if len(self.blocks) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() if transpose else arr - - # We want to copy when na_value is provided to avoid - # mutating the original object - copy = copy or na_value is not lib.no_default - - if self.is_single_block: - blk = self.blocks[0] - if blk.is_extension: - # Avoid implicit conversion of extension blocks to object - - # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no - # attribute "to_numpy" - arr = blk.values.to_numpy( # type: ignore[union-attr] - dtype=dtype, na_value=na_value - ).reshape(blk.shape) - else: - arr = np.asarray(blk.get_values()) - if dtype: - # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has - # incompatible type "Union[ExtensionDtype, str, dtype[Any], - # Type[object]]"; expected "Union[dtype[Any], None, type, - # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, - # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" - arr = arr.astype(dtype, copy=False) # type: ignore[arg-type] - else: - arr = self._interleave(dtype=dtype, na_value=na_value) - # The underlying data was copied within _interleave - copy = False - - if copy: - arr = arr.copy() - - if na_value is not lib.no_default: - arr[isna(arr)] = na_value - - return arr.transpose() if transpose else arr - - def _interleave( - self, dtype: Dtype | None = None, na_value=lib.no_default - ) -> np.ndarray: - """ - Return ndarray from blocks with specified item order - Items must be contained in the blocks - """ - if not dtype: - dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) - - # TODO: https://github.com/pandas-dev/pandas/issues/22791 - # Give EAs some input on what happens here. Sparse needs this. 
- if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - elif is_extension_array_dtype(dtype): - dtype = "object" - elif is_dtype_equal(dtype, str): - dtype = "object" - - # error: Argument "dtype" to "empty" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected - # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int], - # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, - # Any]]]" - result = np.empty(self.shape, dtype=dtype) # type: ignore[arg-type] - - itemmask = np.zeros(self.shape[0]) - - for blk in self.blocks: - rl = blk.mgr_locs - if blk.is_extension: - # Avoid implicit conversion of extension blocks to object - - # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no - # attribute "to_numpy" - arr = blk.values.to_numpy( # type: ignore[union-attr] - dtype=dtype, na_value=na_value - ) - else: - # error: Argument 1 to "get_values" of "Block" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected - # "Union[dtype[Any], ExtensionDtype, None]" - arr = blk.get_values(dtype) # type: ignore[arg-type] - result[rl.indexer] = arr - itemmask[rl.indexer] = 1 - - if not itemmask.all(): - raise AssertionError("Some items were not contained in blocks") - - return result - - def to_dict(self, copy: bool = True): - """ - Return a dict of str(dtype) -> BlockManager - - Parameters - ---------- - copy : bool, default True - - Returns - ------- - values : a dict of dtype -> BlockManager - """ - - bd: dict[str, list[Block]] = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} - def consolidate(self: T) -> T: """ Join together blocks having same dtype @@ -1368,6 +1195,41 @@ def idelete(self, indexer) -> BlockManager: # ---------------------------------------------------------------- # Block-wise Operation + def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + """ + Apply grouped reduction function blockwise, returning a new BlockManager. + + Parameters + ---------- + func : grouped reduction function + ignore_failures : bool, default False + Whether to drop blocks where func raises TypeError. 
+ + Returns + ------- + BlockManager + """ + result_blocks: list[Block] = [] + + for blk in self.blocks: + try: + applied = blk.apply(func) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + result_blocks = extend_blocks(applied, result_blocks) + + if len(result_blocks) == 0: + index = Index([None]) # placeholder + else: + index = Index(range(result_blocks[0].values.shape[-1])) + + if ignore_failures: + return self._combine(result_blocks, index=index) + + return type(self).from_blocks(result_blocks, [self.axes[0], index]) + def reduce( self: T, func: Callable, ignore_failures: bool = False ) -> tuple[T, np.ndarray]: @@ -1499,6 +1361,144 @@ def unstack(self, unstacker, fill_value) -> BlockManager: bm = BlockManager(new_blocks, [new_columns, new_index]) return bm + def to_dict(self, copy: bool = True): + """ + Return a dict of str(dtype) -> BlockManager + + Parameters + ---------- + copy : bool, default True + + Returns + ------- + values : a dict of dtype -> BlockManager + """ + + bd: dict[str, list[Block]] = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + # TODO(EA2D): the combine will be unnecessary with 2D EAs + return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} + + def as_array( + self, + transpose: bool = False, + dtype: Dtype | None = None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : bool, default False + If True, transpose the return array. + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. 
+ + Returns + ------- + arr : ndarray + """ + if len(self.blocks) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if self.is_single_block: + blk = self.blocks[0] + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, na_value=na_value + ).reshape(blk.shape) + else: + arr = np.asarray(blk.get_values()) + if dtype: + # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has + # incompatible type "Union[ExtensionDtype, str, dtype[Any], + # Type[object]]"; expected "Union[dtype[Any], None, type, + # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, + # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + arr = arr.astype(dtype, copy=False) # type: ignore[arg-type] + else: + arr = self._interleave(dtype=dtype, na_value=na_value) + # The underlying data was copied within _interleave + copy = False + + if copy: + arr = arr.copy() + + if na_value is not lib.no_default: + arr[isna(arr)] = na_value + + return arr.transpose() if transpose else arr + + def _interleave( + self, dtype: Dtype | None = None, na_value=lib.no_default + ) -> np.ndarray: + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + if not dtype: + dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) + + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. 
+ if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + elif is_extension_array_dtype(dtype): + dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" + + # error: Argument "dtype" to "empty" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected + # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int], + # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, + # Any]]]" + result = np.empty(self.shape, dtype=dtype) # type: ignore[arg-type] + + itemmask = np.zeros(self.shape[0]) + + for blk in self.blocks: + rl = blk.mgr_locs + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, na_value=na_value + ) + else: + # error: Argument 1 to "get_values" of "Block" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected + # "Union[dtype[Any], ExtensionDtype, None]" + arr = blk.get_values(dtype) # type: ignore[arg-type] + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError("Some items were not contained in blocks") + + return result + class SingleBlockManager(BaseBlockManager, SingleDataManager): """ manage a single block with """ diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3299503dbc3a4..08dba5aa76a2f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -823,7 +823,7 @@ def test_equals_block_order_different_dtypes(self, mgr_string): def test_single_mgr_ctor(self): mgr = create_single_mgr("f8", num_rows=5) - assert mgr.as_array().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] + assert mgr.external_values().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_args(self, value): @@ -837,6 +837,12 @@ def test_validate_bool_args(self, value): bm1.replace_list([1], [2], inplace=value) +def _as_array(mgr): + if mgr.ndim == 1: + return mgr.external_values() + return mgr.as_array() + + class TestIndexing: # Nosetests-style data-driven tests. 
# @@ -859,7 +865,7 @@ class TestIndexing: @pytest.mark.parametrize("mgr", MANAGERS) def test_get_slice(self, mgr): def assert_slice_ok(mgr, axis, slobj): - mat = mgr.as_array() + mat = _as_array(mgr) # we maybe using an ndarray to test slicing and # might not be the full length of the axis @@ -881,7 +887,7 @@ def assert_slice_ok(mgr, axis, slobj): mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( - mat[mat_slobj], sliced.as_array(), check_dtype=False + mat[mat_slobj], _as_array(sliced), check_dtype=False ) tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) @@ -919,10 +925,10 @@ def assert_slice_ok(mgr, axis, slobj): @pytest.mark.parametrize("mgr", MANAGERS) def test_take(self, mgr): def assert_take_ok(mgr, axis, indexer): - mat = mgr.as_array() + mat = _as_array(mgr) taken = mgr.take(indexer, axis) tm.assert_numpy_array_equal( - np.take(mat, indexer, axis), taken.as_array(), check_dtype=False + np.take(mat, indexer, axis), _as_array(taken), check_dtype=False ) tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis]) @@ -940,13 +946,13 @@ def assert_take_ok(mgr, axis, indexer): @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0]) def test_reindex_axis(self, fill_value, mgr): def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): - mat = mgr.as_array() + mat = _as_array(mgr) indexer = mgr.axes[axis].get_indexer_for(new_labels) reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value) tm.assert_numpy_array_equal( algos.take_nd(mat, indexer, axis, fill_value=fill_value), - reindexed.as_array(), + _as_array(reindexed), check_dtype=False, ) tm.assert_index_equal(reindexed.axes[axis], new_labels) @@ -971,13 +977,13 @@ def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0]) def test_reindex_indexer(self, fill_value, mgr): def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): - mat = mgr.as_array() + mat = _as_array(mgr) reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value) reindexed = mgr.reindex_indexer( new_labels, indexer, axis, fill_value=fill_value ) tm.assert_numpy_array_equal( - reindexed_mat, reindexed.as_array(), check_dtype=False + reindexed_mat, _as_array(reindexed), check_dtype=False ) tm.assert_index_equal(reindexed.axes[axis], new_labels)
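
Illustrative notes (reviewer sketches; none of the code below is part of the commit):

The relocated `DataFrame._to_dict_of_blocks` splits a frame into per-dtype
sub-frames via `BlockManager.to_dict`, keyed by `str(dtype)`. A minimal sketch
of the observable behavior using only public API (the dtype-mask grouping here
stands in for the internal block bookkeeping, it is not the block-based
implementation):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5], "c": ["x", "y"]})

# Group columns by dtype; each value is a homogeneous sub-frame, which is
# the shape of what _to_dict_of_blocks returns.
by_dtype = {
    str(dtype): df.loc[:, df.dtypes == dtype]
    for dtype in df.dtypes.unique()
}
for key, sub in by_dtype.items():
    print(key, list(sub.columns))  # int64 ['a'], float64 ['b'], object ['c']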
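The `DataFrame.values` docstring moved into frame.py documents the
lowest-common-denominator upcasting that `as_array`/`_interleave` perform.
The rules it cites are easy to verify directly (behavior as of pandas 1.x;
example mine, not from the patch):

import numpy as np
import pandas as pd

# int32 can hold every uint8 value, so the interleaved dtype stays int32.
df = pd.DataFrame({"i": np.array([1, 2], dtype="int32"),
                   "u": np.array([3, 4], dtype="uint8")})
print(df.values.dtype)  # int32

# Neither int64 nor uint64 can hold the other; the np.find_common_type
# convention falls back to float64.
df2 = pd.DataFrame({"i": np.array([1, 2], dtype="int64"),
                    "u": np.array([3, 4], dtype="uint64")})
print(df2.values.dtype)  # float64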
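Both managers now keep `grouped_reduce` beside the other array-wise /
block-wise operations, with the same `ignore_failures` contract: apply the
reducer piecewise, re-raise on failure unless `ignore_failures` is set,
otherwise drop the failing piece and remember which columns survived. A
self-contained sketch of that control flow over plain numpy columns
(`grouped_reduce_sketch` is an illustrative name, not pandas API):

import numpy as np

def grouped_reduce_sketch(columns, func, ignore_failures=False):
    # columns: list of 1D ndarrays; func: a reduction over one column.
    results, kept = [], []
    for i, arr in enumerate(columns):
        try:
            res = func(arr)
        except (TypeError, NotImplementedError):
            if not ignore_failures:
                raise
            continue  # drop this column, as both managers do
        results.append(res)
        kept.append(i)
    return results, kept

cols = [np.array([1.0, 2.0]), np.array(["x", "y"]), np.array([3, 4])]
res, kept = grouped_reduce_sketch(cols, lambda a: a.sum(), ignore_failures=True)
print(res, kept)  # [3.0, 7] [0, 2] -- the string column was dropped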
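`ArrayManager.apply_2d` materializes the full 2D array, applies `f` once, and
splits the result back into columns (taking an empty column axis when a
failure is swallowed). The round trip it performs looks roughly like this
(plain numpy standing in for the manager's arrays; a sketch, not the manager
code):

import numpy as np

arrays = [np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0, 6.0])]

# Interleave the columns into one 2D array, apply f to the whole block,
# then split the result back column by column.
values = np.column_stack(arrays)
result = np.round(values * 10.0)
result_arrays = [result[:, i] for i in range(len(arrays))]
print(result_arrays)  # [array([10., 20., 30.]), array([40., 50., 60.])]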