Skip to content

[ArrayManager] Enable read_parquet to not create 2D blocks when using ArrayManager #40303

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
7 commits merged on Apr 26, 2021
7 changes: 5 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,13 +668,16 @@ def __init__(

NDFrame.__init__(self, mgr)

def _as_manager(self, typ: str) -> DataFrame:
def _as_manager(self, typ: str, copy: bool = True) -> DataFrame:
"""
Private helper function to create a DataFrame with specific manager.

Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).

Returns
-------
Expand All @@ -683,7 +686,7 @@ def _as_manager(self, typ: str) -> DataFrame:
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ)
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return DataFrame(new_mgr)

Expand Down
10 changes: 7 additions & 3 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,11 @@ def fill_masked_arrays(data: MaskedRecords, arr_columns: Index) -> List[np.ndarr
return new_arrays


def mgr_to_mgr(mgr, typ: str):
def mgr_to_mgr(mgr, typ: str, copy: bool = True):
"""
Convert to specific type of Manager. Does not copy if the type is already
correct. Does not guarantee a copy otherwise.
correct. Does not guarantee a copy otherwise. `copy` keyword only controls
whether conversion from Block->ArrayManager copies the 1D arrays.
"""
new_mgr: Manager

Expand All @@ -209,7 +210,10 @@ def mgr_to_mgr(mgr, typ: str):
if isinstance(mgr, ArrayManager):
new_mgr = mgr
else:
arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))]
if copy:
arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))]
else:
arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
else:
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
Expand Down
8 changes: 7 additions & 1 deletion pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ def read(
"'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
f"({self.api.__version__} is installed"
)
manager = get_option("mode.data_manager")
if manager == "array":
to_pandas_kwargs["split_blocks"] = True

path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
path,
Expand All @@ -243,9 +246,12 @@ def read(
mode="rb",
)
try:
return self.api.parquet.read_table(
result = self.api.parquet.read_table(
path_or_handle, columns=columns, **kwargs
).to_pandas(**to_pandas_kwargs)
if manager == "array":
result = result._as_manager("array", copy=False)
return result
finally:
if handles is not None:
handles.close()
Expand Down