diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 54bd4220bc060..28bd4db3a15ca 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -142,6 +142,9 @@ def __from_arrow__( return BooleanArray._concat_same_type(results) +_boolean_dtype = BooleanDtype() + + def coerce_to_array( values, mask=None, copy: bool = False ) -> tuple[np.ndarray, np.ndarray]: @@ -299,7 +302,7 @@ def __init__( "values should be boolean numpy array. Use " "the 'pd.array' function instead" ) - self._dtype = BooleanDtype() + self._dtype = _boolean_dtype super().__init__(values, mask, copy=copy) @property diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2be1c62cde2ec..921dced221923 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -107,7 +107,10 @@ needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, + ExtensionDtype, +) from pandas.core.dtypes.missing import ( isna, notna, @@ -2505,6 +2508,7 @@ def _from_arrays( index, dtype: Dtype | None = None, verify_integrity: bool = True, + is_1d_ea_only: bool = False, ) -> Self: """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2544,6 +2548,7 @@ def _from_arrays( dtype=dtype, verify_integrity=verify_integrity, typ=manager, + is_1d_ea_only=is_1d_ea_only, ) return cls(mgr) @@ -3610,11 +3615,21 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: # We have EAs with the same dtype. We can preserve that dtype in transpose. 
dtype = dtypes[0] arr_type = dtype.construct_array_type() - values = self.values + if isinstance(dtype, BaseMaskedDtype): + data, mask = self._mgr.as_array_masked() + new_values = [arr_type(data[i], mask[i]) for i in range(self.shape[0])] + else: + values = self.values + new_values = [ + arr_type._from_sequence(row, dtype=dtype) for row in values + ] - new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] result = type(self)._from_arrays( - new_values, index=self.columns, columns=self.index + new_values, + index=self.columns, + columns=self.index, + verify_integrity=False, + is_1d_ea_only=True, ) else: @@ -10935,7 +10950,23 @@ def _get_data() -> DataFrame: df = _get_data() if axis is None: return func(df.values) - elif axis == 1: + + # if len(df._mgr) > 0: + # common_dtype = find_common_type(list(df._mgr.get_dtypes())) + # is_masked_ea = isinstance(common_dtype, BaseMaskedDtype) + # is_np = isinstance(common_dtype, np.dtype) + # else: + # common_dtype = None + + # if axis == 1 and common_dtype and is_masked_ea: + # data, mask = self._mgr.as_array_masked() + # ea2d = common_dtype.construct_array_type()(data, mask) + # result = ea2d._reduce(name, axis=axis, skipna=skipna, **kwds) + # labels = self._get_agg_axis(axis) + # result = self._constructor_sliced(result, index=labels, copy=False) + # return result + + if axis == 1: if len(df.index) == 0: # Taking a transpose would result in no columns, losing the dtype. # In the empty case, reducing along axis 0 or 1 gives the same diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b114b8a1aa7aa..c13fd1719afaa 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -104,6 +104,7 @@ def arrays_to_mgr( verify_integrity: bool = True, typ: str | None = None, consolidate: bool = True, + is_1d_ea_only: bool = False, ) -> Manager: """ Segregate Series based on type and coerce into matrices. 
@@ -127,7 +128,8 @@ def arrays_to_mgr( else: index = ensure_index(index) - arrays = [extract_array(x, extract_numpy=True) for x in arrays] + if not is_1d_ea_only: + arrays = [extract_array(x, extract_numpy=True) for x in arrays] # with _from_arrays, the passed arrays should never be Series objects refs = [None] * len(arrays) @@ -152,7 +154,7 @@ def arrays_to_mgr( if typ == "block": return create_block_manager_from_column_arrays( - arrays, axes, consolidate=consolidate, refs=refs + arrays, axes, consolidate, refs=refs, is_1d_ea_only=is_1d_ea_only ) elif typ == "array": return ArrayManager(arrays, [index, columns]) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 70d7920ac5bb2..29404fab0461e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1747,6 +1747,40 @@ def as_array( return arr.transpose() + def as_array_masked(self) -> np.ndarray: + """ + Convert the blockmanager data into numpy data and mask arrays. + + Returns + ------- + (data, mask) : tuple of ndarray + """ + # # TODO(CoW) handle case where resulting array is a view + # if len(self.blocks) == 0: + # arr = np.empty(self.shape, dtype=float) + # return arr.transpose() + + # TODO we already established we only have a single dtype, but this + # could be generalized to be a mix of all masked dtypes + dtype = self.blocks[0].dtype.numpy_dtype + + result_data = np.empty(self.shape, dtype=dtype) + result_mask = np.empty(self.shape, dtype="bool") + + itemmask = np.zeros(self.shape[0]) + + for blk in self.blocks: + rl = blk.mgr_locs + arr = blk.values + result_data[rl.indexer] = arr._data + result_mask[rl.indexer] = arr._mask + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError("Some items were not contained in blocks") + + return result_data.transpose(), result_mask.transpose() + def _interleave( self, dtype: np.dtype | None = None, @@ -2130,6 +2164,7 @@ def create_block_manager_from_column_arrays( axes: list[Index], consolidate: bool, refs:
list, + is_1d_ea_only: bool = False, ) -> BlockManager: # Assertions disabled for performance (caller is responsible for verifying) # assert isinstance(axes, list) @@ -2143,7 +2178,7 @@ def create_block_manager_from_column_arrays( # verify_integrity=False below. try: - blocks = _form_blocks(arrays, consolidate, refs) + blocks = _form_blocks(arrays, consolidate, refs, is_1d_ea_only) mgr = BlockManager(blocks, axes, verify_integrity=False) except ValueError as e: raise_construction_error(len(arrays), arrays[0].shape, axes, e) @@ -2197,9 +2232,17 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]: return sep, isinstance(dtype, np.dtype), dtype -def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: +def _form_blocks( + arrays: list[ArrayLike], consolidate: bool, refs: list, is_1d_ea_only: bool +) -> list[Block]: tuples = list(enumerate(arrays)) + if is_1d_ea_only: + block_type = get_block_type(arrays[0].dtype) + return [ + block_type(arr, placement=BlockPlacement(i), ndim=2) for i, arr in tuples + ] + if not consolidate: nbs = _tuples_to_blocks_no_consolidate(tuples, refs) return nbs