Skip to content

Commit 58181b1

Browse files
[ArrayManager] Enable read_parquet to not create 2D blocks when using ArrayManager (#40303)
1 parent ead9404 commit 58181b1

File tree

4 files changed

+47
-13
lines changed

4 files changed

+47
-13
lines changed

pandas/core/generic.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -286,13 +286,18 @@ def _from_mgr(cls, mgr: Manager):
286286
object.__setattr__(obj, "_attrs", {})
287287
return obj
288288

289-
def _as_manager(self: FrameOrSeries, typ: str) -> FrameOrSeries:
289+
def _as_manager(
290+
self: FrameOrSeries, typ: str, copy: bool_t = True
291+
) -> FrameOrSeries:
290292
"""
291293
Private helper function to create a DataFrame with specific manager.
292294
293295
Parameters
294296
----------
295297
typ : {"block", "array"}
298+
copy : bool, default True
299+
Only controls whether the conversion from Block->ArrayManager
300+
copies the 1D arrays (to ensure proper/contiguous memory layout).
296301
297302
Returns
298303
-------
@@ -301,7 +306,7 @@ def _as_manager(self: FrameOrSeries, typ: str) -> FrameOrSeries:
301306
to be a copy or not.
302307
"""
303308
new_mgr: Manager
304-
new_mgr = mgr_to_mgr(self._mgr, typ=typ)
309+
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
305310
# fastpath of passing a manager doesn't check the option/manager class
306311
return self._constructor(new_mgr).__finalize__(self)
307312

pandas/core/internals/construction.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -209,10 +209,11 @@ def fill_masked_arrays(data: MaskedRecords, arr_columns: Index) -> list[np.ndarr
209209
return new_arrays
210210

211211

212-
def mgr_to_mgr(mgr, typ: str):
212+
def mgr_to_mgr(mgr, typ: str, copy: bool = True):
213213
"""
214214
Convert to specific type of Manager. Does not copy if the type is already
215-
correct. Does not guarantee a copy otherwise.
215+
correct. Does not guarantee a copy otherwise. `copy` keyword only controls
216+
whether conversion from Block->ArrayManager copies the 1D arrays.
216217
"""
217218
new_mgr: Manager
218219

@@ -231,10 +232,15 @@ def mgr_to_mgr(mgr, typ: str):
231232
new_mgr = mgr
232233
else:
233234
if mgr.ndim == 2:
234-
arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))]
235+
arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
236+
if copy:
237+
arrays = [arr.copy() for arr in arrays]
235238
new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
236239
else:
237-
new_mgr = SingleArrayManager([mgr.internal_values()], [mgr.index])
240+
array = mgr.internal_values()
241+
if copy:
242+
array = array.copy()
243+
new_mgr = SingleArrayManager([array], [mgr.index])
238244
else:
239245
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
240246
return new_mgr

pandas/io/parquet.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,9 @@ def read(
231231
"'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
232232
f"({self.api.__version__} is installed"
233233
)
234+
manager = get_option("mode.data_manager")
235+
if manager == "array":
236+
to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment]
234237

235238
path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
236239
path,
@@ -239,9 +242,12 @@ def read(
239242
mode="rb",
240243
)
241244
try:
242-
return self.api.parquet.read_table(
245+
result = self.api.parquet.read_table(
243246
path_or_handle, columns=columns, **kwargs
244247
).to_pandas(**to_pandas_kwargs)
248+
if manager == "array":
249+
result = result._as_manager("array", copy=False)
250+
return result
245251
finally:
246252
if handles is not None:
247253
handles.close()

pandas/tests/io/test_parquet.py

+23-6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import numpy as np
1010
import pytest
1111

12+
from pandas._config import get_option
13+
1214
from pandas.compat import (
1315
PY38,
1416
is_platform_windows,
@@ -41,20 +43,21 @@
4143
_HAVE_FASTPARQUET = False
4244

4345

44-
pytestmark = [
45-
pytest.mark.filterwarnings("ignore:RangeIndex.* is deprecated:DeprecationWarning"),
46-
# TODO(ArrayManager) fastparquet / pyarrow rely on BlockManager internals
47-
td.skip_array_manager_not_yet_implemented,
48-
]
46+
pytestmark = pytest.mark.filterwarnings(
47+
"ignore:RangeIndex.* is deprecated:DeprecationWarning"
48+
)
49+
4950

51+
# TODO(ArrayManager) fastparquet relies on BlockManager internals
5052

5153
# setup engines & skips
5254
@pytest.fixture(
5355
params=[
5456
pytest.param(
5557
"fastparquet",
5658
marks=pytest.mark.skipif(
57-
not _HAVE_FASTPARQUET, reason="fastparquet is not installed"
59+
not _HAVE_FASTPARQUET or get_option("mode.data_manager") == "array",
60+
reason="fastparquet is not installed or ArrayManager is used",
5861
),
5962
),
6063
pytest.param(
@@ -80,6 +83,8 @@ def pa():
8083
def fp():
8184
if not _HAVE_FASTPARQUET:
8285
pytest.skip("fastparquet is not installed")
86+
elif get_option("mode.data_manager") == "array":
87+
pytest.skip("ArrayManager is not supported with fastparquet")
8388
return "fastparquet"
8489

8590

@@ -923,6 +928,18 @@ def test_filter_row_groups(self, pa):
923928
)
924929
assert len(result) == 1
925930

931+
def test_read_parquet_manager(self, pa, using_array_manager):
932+
# ensure that read_parquet honors the pandas.options.mode.data_manager option
933+
df = pd.DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])
934+
935+
with tm.ensure_clean() as path:
936+
df.to_parquet(path, pa)
937+
result = read_parquet(path, pa)
938+
if using_array_manager:
939+
assert isinstance(result._mgr, pd.core.internals.ArrayManager)
940+
else:
941+
assert isinstance(result._mgr, pd.core.internals.BlockManager)
942+
926943

927944
class TestParquetFastParquet(Base):
928945
def test_basic(self, fp, df_full):

0 commit comments

Comments
 (0)