Skip to content

ENH: allow non-consolidation in constructors #36894

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 71 additions & 11 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,8 @@ class DataFrame(NDFrame):
Data type to force. Only a single dtype is allowed. If None, infer.
copy : bool, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input.
consolidate : bool or None, default None
Whether to consolidate the arrays in the new DataFrame.

See Also
--------
Expand Down Expand Up @@ -437,12 +439,16 @@ def __init__(
columns: Optional[Axes] = None,
dtype: Optional[Dtype] = None,
copy: bool = False,
consolidate=None,
):
if data is None:
data = {}
if dtype is not None:
dtype = self._validate_dtype(dtype)

if consolidate is None:
consolidate = not copy

if isinstance(data, DataFrame):
data = data._mgr

Expand All @@ -457,7 +463,7 @@ def __init__(
)

elif isinstance(data, dict):
mgr = init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(data, index, columns, dtype=dtype, consolidate=consolidate)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

Expand All @@ -474,19 +480,41 @@ def __init__(
data[mask] = fill_value
else:
data = data.copy()
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data,
index,
columns,
dtype=dtype,
copy=copy,
consolidate=consolidate,
)

elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
data_columns = list(data.dtype.names)
data = {k: data[k] for k in data_columns}
if columns is None:
columns = data_columns
mgr = init_dict(data, index, columns, dtype=dtype)
mgr = init_dict(
data, index, columns, dtype=dtype, consolidate=consolidate
)
elif getattr(data, "name", None) is not None:
mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
mgr = init_dict(
{data.name: data},
index,
columns,
dtype=dtype,
consolidate=consolidate,
)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data,
index,
columns,
dtype=dtype,
copy=copy,
consolidate=consolidate,
)

# For data is list-like, or Iterable (will consume into list)
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
Expand All @@ -510,11 +538,27 @@ def __init__(
else:
index = ibase.default_index(len(data))

mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
mgr = arrays_to_mgr(
arrays,
columns,
index,
columns,
dtype=dtype,
consolidate=consolidate,
)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
mgr = init_ndarray(
data,
index,
columns,
dtype=dtype,
copy=copy,
consolidate=consolidate,
)
else:
mgr = init_dict({}, index, columns, dtype=dtype)
mgr = init_dict(
{}, index, columns, dtype=dtype, consolidate=consolidate
)
# For data is scalar
else:
if index is None or columns is None:
Expand All @@ -530,7 +574,9 @@ def __init__(
construct_1d_arraylike_from_scalar(data, len(index), dtype)
for _ in range(len(columns))
]
mgr = arrays_to_mgr(values, columns, index, columns, dtype=None)
mgr = arrays_to_mgr(
values, columns, index, columns, dtype=None, consolidate=consolidate
)
else:
# Attempt to coerce to a numpy array
try:
Expand All @@ -550,7 +596,12 @@ def __init__(
)

mgr = init_ndarray(
values, index, columns, dtype=values.dtype, copy=False
values,
index,
columns,
dtype=values.dtype,
copy=False,
consolidate=consolidate,
)

NDFrame.__init__(self, mgr)
Expand Down Expand Up @@ -1665,6 +1716,7 @@ def from_records(
columns=None,
coerce_float=False,
nrows=None,
consolidate: bool = True,
) -> DataFrame:
"""
Convert structured or record ndarray to DataFrame.
Expand Down Expand Up @@ -1692,6 +1744,8 @@ def from_records(
decimal.Decimal) to floating point, useful for SQL result sets.
nrows : int, default None
Number of rows to read if data is an iterator.
consolidate : bool, default True
Whether to consolidate the arrays in the new DataFrame.

Returns
-------
Expand Down Expand Up @@ -1827,7 +1881,9 @@ def from_records(
arr_columns = arr_columns.drop(arr_exclude)
columns = columns.drop(exclude)

mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
mgr = arrays_to_mgr(
arrays, arr_columns, result_index, columns, consolidate=consolidate
)

return cls(mgr)

Expand Down Expand Up @@ -2006,6 +2062,7 @@ def _from_arrays(
index,
dtype: Optional[Dtype] = None,
verify_integrity: bool = True,
consolidate: bool = True,
) -> DataFrame:
"""
Create DataFrame from a list of arrays corresponding to the columns.
Expand All @@ -2026,6 +2083,8 @@ def _from_arrays(
stored in a block (numpy ndarray or ExtensionArray), have the same
length as and are aligned with the index, and that `columns` and
`index` are ensured to be an Index object.
consolidate : bool, default True
Whether to consolidate the passed arrays in the new DataFrame.

Returns
-------
Expand All @@ -2041,6 +2100,7 @@ def _from_arrays(
columns,
dtype=dtype,
verify_integrity=verify_integrity,
consolidate=consolidate,
)
return cls(mgr)

Expand Down
42 changes: 34 additions & 8 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def arrays_to_mgr(
columns,
dtype: Optional[DtypeObj] = None,
verify_integrity: bool = True,
consolidate: bool = True,
):
"""
Segregate Series based on type and coerce into matrices.
Expand All @@ -91,7 +92,9 @@ def arrays_to_mgr(
# from BlockManager perspective
axes = [columns, index]

return create_block_manager_from_arrays(arrays, arr_names, axes)
return create_block_manager_from_arrays(
arrays, arr_names, axes, consolidate=consolidate
)


def masked_rec_array_to_mgr(
Expand Down Expand Up @@ -130,7 +133,9 @@ def masked_rec_array_to_mgr(
if columns is None:
columns = arr_columns

mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)
mgr = arrays_to_mgr(
arrays, arr_columns, index, columns, dtype, consolidate=True
) # FIXME: don't hardcode

if copy:
mgr = mgr.copy()
Expand All @@ -141,7 +146,14 @@ def masked_rec_array_to_mgr(
# DataFrame Constructor Interface


def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
def init_ndarray(
values,
index,
columns,
dtype: Optional[DtypeObj],
copy: bool,
consolidate: bool = True,
):
# input must be a ndarray, list, Series, index

if isinstance(values, ABCSeries):
Expand Down Expand Up @@ -170,7 +182,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
values = values.copy()

index, columns = _get_axes(len(values), 1, index, columns)
return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
return arrays_to_mgr(
[values], columns, index, columns, dtype=dtype, consolidate=consolidate
)
elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
# GH#19157

Expand All @@ -184,7 +198,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
if columns is None:
columns = Index(range(len(values)))

return arrays_to_mgr(values, columns, index, columns, dtype=dtype)
return arrays_to_mgr(
values, columns, index, columns, dtype=dtype, consolidate=consolidate
)

# by definition an array here
# the dtypes will be coerced to a single dtype
Expand Down Expand Up @@ -233,10 +249,18 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
else:
block_values = [values]

return create_block_manager_from_blocks(block_values, [columns, index])
return create_block_manager_from_blocks(
block_values, [columns, index], consolidate=consolidate
)


def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
def init_dict(
data: Dict,
index,
columns,
dtype: Optional[DtypeObj] = None,
consolidate: bool = True,
):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
Expand Down Expand Up @@ -282,7 +306,9 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
arrays = [
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
return arrays_to_mgr(
arrays, data_names, index, columns, dtype=dtype, consolidate=consolidate
)


# ---------------------------------------------------------------------
Expand Down
17 changes: 12 additions & 5 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1595,7 +1595,9 @@ def fast_xs(self, loc):
# Constructor Helpers


def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
def create_block_manager_from_blocks(
blocks, axes: List[Index], consolidate: bool = True
) -> BlockManager:
try:
if len(blocks) == 1 and not isinstance(blocks[0], Block):
# if blocks[0] is of length 0, return empty blocks
Expand All @@ -1610,7 +1612,8 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
]

mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
if consolidate:
mgr._consolidate_inplace()
return mgr

except ValueError as e:
Expand All @@ -1620,7 +1623,10 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:


def create_block_manager_from_arrays(
arrays, names: Index, axes: List[Index]
arrays,
names: Index,
axes: List[Index],
consolidate: bool = True,
) -> BlockManager:
assert isinstance(names, Index)
assert isinstance(axes, list)
Expand All @@ -1629,10 +1635,11 @@ def create_block_manager_from_arrays(
try:
blocks = form_blocks(arrays, names, axes)
mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
return mgr
except ValueError as e:
raise construction_error(len(arrays), arrays[0].shape, axes, e)
if consolidate:
mgr._consolidate_inplace()
return mgr


def construction_error(tot_items, block_shape, axes, e=None):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def dispatch_to_series(left, right, func, axis: Optional[int] = None):
raise NotImplementedError(right)

return type(left)._from_arrays(
arrays, left.columns, left.index, verify_integrity=False
arrays, left.columns, left.index, verify_integrity=False, consolidate=False
)


Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/arithmetic/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,9 @@ def mismatched_freq(request):
# ------------------------------------------------------------------


@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func)
@pytest.fixture(
params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func # type: ignore[list-item]
)
def box(request):
"""
Several array-like containers that should have effectively identical
Expand Down