From cbc97f0e7ac49f5a78eb982f933281dc879a7e2a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Oct 2020 10:45:28 -0700 Subject: [PATCH 1/2] ENH: allow non-consolidation in constructors --- pandas/core/frame.py | 82 +++++++++++++++++++++++---- pandas/core/internals/construction.py | 42 +++++++++++--- pandas/core/internals/managers.py | 17 ++++-- pandas/core/ops/__init__.py | 2 +- 4 files changed, 118 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1f9987d9d3f5b..4af6097a51f73 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -361,6 +361,8 @@ class DataFrame(NDFrame): Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input. + consolidate : bool or None, default None + Whether to consolidate the arrays in the new DataFrame. See Also -------- @@ -437,12 +439,16 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, + consolidate=None, ): if data is None: data = {} if dtype is not None: dtype = self._validate_dtype(dtype) + if consolidate is None: + consolidate = not copy + if isinstance(data, DataFrame): data = data._mgr @@ -457,7 +463,7 @@ def __init__( ) elif isinstance(data, dict): - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype, consolidate=consolidate) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords @@ -474,7 +480,14 @@ def __init__( data[mask] = fill_value else: data = data.copy() - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -482,11 +495,26 @@ def __init__( data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict( + data, index, columns, dtype=dtype, consolidate=consolidate + ) elif getattr(data, "name", None) is not None: - mgr = init_dict({data.name: data}, index, columns, dtype=dtype) + mgr = init_dict( + {data.name: data}, + index, + columns, + dtype=dtype, + consolidate=consolidate, + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): @@ -510,11 +538,27 @@ def __init__( else: index = ibase.default_index(len(data)) - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + mgr = arrays_to_mgr( + arrays, + columns, + index, + columns, + dtype=dtype, + consolidate=consolidate, + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) else: - mgr = init_dict({}, index, columns, dtype=dtype) + mgr = init_dict( + {}, index, columns, dtype=dtype, consolidate=consolidate + ) # For data is scalar else: if index is None or columns is None: @@ -530,7 +574,9 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) + mgr = arrays_to_mgr( + values, columns, index, columns, dtype=None, 
consolidate=consolidate + ) else: # Attempt to coerce to a numpy array try: @@ -550,7 +596,12 @@ def __init__( ) mgr = init_ndarray( - values, index, columns, dtype=values.dtype, copy=False + values, + index, + columns, + dtype=values.dtype, + copy=False, + consolidate=consolidate, ) NDFrame.__init__(self, mgr) @@ -1665,6 +1716,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, + consolidate: bool = True, ) -> DataFrame: """ Convert structured or record ndarray to DataFrame. @@ -1692,6 +1744,8 @@ def from_records( decimal.Decimal) to floating point, useful for SQL result sets. nrows : int, default None Number of rows to read if data is an iterator. + consolidate: bool, default True + Whether to consolidate the arrays in the new DataFrame. Returns ------- @@ -1827,7 +1881,9 @@ def from_records( arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) + mgr = arrays_to_mgr( + arrays, arr_columns, result_index, columns, consolidate=consolidate + ) return cls(mgr) @@ -2006,6 +2062,7 @@ def _from_arrays( index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, + consolidate: bool = True, ) -> DataFrame: """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2026,6 +2083,8 @@ def _from_arrays( stored in a block (numpy ndarray or ExtensionArray), have the same length as and are aligned with the index, and that `columns` and `index` are ensured to be an Index object. + consolidate: bool, default True + Whether to consolidate the passed arrays in the new DataFrame. Returns ------- @@ -2041,6 +2100,7 @@ def _from_arrays( columns, dtype=dtype, verify_integrity=verify_integrity, + consolidate=consolidate, ) return cls(mgr) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6244f1bf0a2d2..618e06ad30d8a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -65,6 +65,7 @@ def arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, + consolidate: bool = True, ): """ Segregate Series based on type and coerce into matrices. 
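The patch threads a single ``consolidate`` flag from the public constructors (``DataFrame.__init__``, ``DataFrame.from_records``, ``DataFrame._from_arrays``) down into the internals. In ``__init__`` the keyword defaults to None and is resolved to ``not copy``, so plain construction still consolidates eagerly, while ``copy=True`` or an explicit ``consolidate=False`` skips that step. A minimal usage sketch, assuming this patch is applied (the keyword does not exist in any released pandas) and touching the private ``_mgr`` attribute only for inspection:

    import numpy as np
    import pandas as pd

    # A 2D object-dtype array goes through init_ndarray, which builds one
    # internal block per column before calling the block-manager constructor.
    arr = np.array([["x", 1], ["y", 2]], dtype=object)

    df = pd.DataFrame(arr)
    print(df._mgr.nblocks)  # 1 -- same-dtype blocks are consolidated by default

    # Hypothetical keyword from this patch: skipping _consolidate_inplace()
    # should leave the per-column blocks separate.
    df_nc = pd.DataFrame(arr, consolidate=False)
    print(df_nc._mgr.nblocks)

    # consolidate=None resolves to ``not copy``, so under this patch copy=True
    # also implies no eager consolidation.
    df_copy = pd.DataFrame(arr, copy=True)
    print(df_copy._mgr.is_consolidated())
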
@@ -91,7 +92,9 @@ def arrays_to_mgr( # from BlockManager perspective axes = [columns, index] - return create_block_manager_from_arrays(arrays, arr_names, axes) + return create_block_manager_from_arrays( + arrays, arr_names, axes, consolidate=consolidate + ) def masked_rec_array_to_mgr( @@ -130,7 +133,9 @@ def masked_rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) + mgr = arrays_to_mgr( + arrays, arr_columns, index, columns, dtype, consolidate=True + ) # FIXME: dont hardcode if copy: mgr = mgr.copy() @@ -141,7 +146,14 @@ def masked_rec_array_to_mgr( # DataFrame Constructor Interface -def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): +def init_ndarray( + values, + index, + columns, + dtype: Optional[DtypeObj], + copy: bool, + consolidate: bool = True, +): # input must be a ndarray, list, Series, index if isinstance(values, ABCSeries): @@ -170,7 +182,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): values = values.copy() index, columns = _get_axes(len(values), 1, index, columns) - return arrays_to_mgr([values], columns, index, columns, dtype=dtype) + return arrays_to_mgr( + [values], columns, index, columns, dtype=dtype, consolidate=consolidate + ) elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype): # GH#19157 @@ -184,7 +198,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if columns is None: columns = Index(range(len(values))) - return arrays_to_mgr(values, columns, index, columns, dtype=dtype) + return arrays_to_mgr( + values, columns, index, columns, dtype=dtype, consolidate=consolidate + ) # by definition an array here # the dtypes will be coerced to a single dtype @@ -233,10 +249,18 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): else: block_values = [values] - return create_block_manager_from_blocks(block_values, [columns, index]) + return create_block_manager_from_blocks( + block_values, [columns, index], consolidate=consolidate + ) -def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): +def init_dict( + data: Dict, + index, + columns, + dtype: Optional[DtypeObj] = None, + consolidate: bool = True, +): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. 
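These construction helpers only forward the flag; the consolidation they can now opt out of happens in the block-manager constructors changed further below. For context, a short illustration of what that step does, written against pandas' private internals API as it exists before this patch (a sketch for exposition, not stable public API):

    import numpy as np
    from pandas import Index
    from pandas.core.internals import make_block
    from pandas.core.internals.managers import (
        BlockManager,
        create_block_manager_from_blocks,
    )

    idx = Index(range(3))
    cols = Index(["a", "b"])
    # Two single-column blocks sharing the same dtype.
    blocks = [
        make_block(np.arange(3, dtype="int64").reshape(1, 3), placement=[0]),
        make_block(np.arange(3, 6, dtype="int64").reshape(1, 3), placement=[1]),
    ]

    raw = BlockManager(blocks, [cols, idx])
    print(raw.nblocks)  # 2 -- nothing has been consolidated yet

    mgr = create_block_manager_from_blocks(blocks, [cols, idx])
    print(mgr.nblocks)  # 1 -- the helper merges same-dtype blocks via _consolidate_inplace()

    # With this patch, create_block_manager_from_blocks(..., consolidate=False)
    # would skip _consolidate_inplace() and keep the two blocks separate.
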
@@ -282,7 +306,9 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return arrays_to_mgr( + arrays, data_names, index, columns, dtype=dtype, consolidate=consolidate + ) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f2480adce89b4..2a08e2dcbe136 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1595,7 +1595,9 @@ def fast_xs(self, loc): # Constructor Helpers -def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: +def create_block_manager_from_blocks( + blocks, axes: List[Index], consolidate: bool = True +) -> BlockManager: try: if len(blocks) == 1 and not isinstance(blocks[0], Block): # if blocks[0] is of length 0, return empty blocks @@ -1610,7 +1612,8 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: ] mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() + if consolidate: + mgr._consolidate_inplace() return mgr except ValueError as e: @@ -1620,7 +1623,10 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: def create_block_manager_from_arrays( - arrays, names: Index, axes: List[Index] + arrays, + names: Index, + axes: List[Index], + consolidate: bool = True, ) -> BlockManager: assert isinstance(names, Index) assert isinstance(axes, list) @@ -1629,10 +1635,11 @@ def create_block_manager_from_arrays( try: blocks = form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr except ValueError as e: raise construction_error(len(arrays), arrays[0].shape, axes, e) + if consolidate: + mgr._consolidate_inplace() + return mgr def construction_error(tot_items, block_shape, axes, e=None): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 36e3a0e37c1ae..c8a968e61983f 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -272,7 +272,7 @@ def dispatch_to_series(left, right, func, axis: Optional[int] = None): raise NotImplementedError(right) return type(left)._from_arrays( - arrays, left.columns, left.index, verify_integrity=False + arrays, left.columns, left.index, verify_integrity=False, consolidate=False ) From 5c94129f69f33e40e9edfcd58e3f980aa42b6360 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Oct 2020 15:00:42 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/tests/arithmetic/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 6286711ac6113..c808644e007b0 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -221,7 +221,9 @@ def mismatched_freq(request): # ------------------------------------------------------------------ -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func) +@pytest.fixture( + params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func # type: ignore[list-item] +) def box(request): """ Several array-like containers that should have effectively identical