diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 94750bb25733c..d9940d23ac66d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -700,7 +700,6 @@ def __init__( arrays, columns, index, - columns, dtype=dtype, typ=manager, ) @@ -746,9 +745,7 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr( - values, columns, index, columns, dtype=None, typ=manager - ) + mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) else: arr2d = construct_2d_arraylike_from_scalar( data, @@ -2025,6 +2022,26 @@ def from_records( if columns is not None: columns = ensure_index(columns) + def maybe_reorder( + arrays: list[ArrayLike], arr_columns: Index, columns: Index, index + ) -> tuple[list[ArrayLike], Index, Index | None]: + """ + If our desired 'columns' do not match the data's pre-existing 'arr_columns', + we re-order our arrays. This is like a pre-emptive (cheap) reindex. + """ + if len(arrays): + length = len(arrays[0]) + else: + length = 0 + + result_index = None + if len(arrays) == 0 and index is None and length == 0: + # for backward compat use an object Index instead of RangeIndex + result_index = Index([]) + + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) + return arrays, arr_columns, result_index + if is_iterator(data): if nrows == 0: return cls() @@ -2062,20 +2079,9 @@ def from_records( arr_columns_list.append(k) arrays.append(v) - if len(arrays): - length = len(arrays[0]) - elif index is not None: - length = len(index) - else: - length = 0 - arr_columns = Index(arr_columns_list) - if len(arrays) == 0 and index is None and length == 0: - # for backward compat use an object Index instead of RangeIndex - result_index = Index([]) - - arrays, arr_columns = reorder_arrays( - arrays, arr_columns, columns, length + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index ) elif isinstance(data, (np.ndarray, DataFrame)): @@ -2097,6 +2103,10 @@ def from_records( arr_columns = ensure_index(arr_columns) if columns is None: columns = arr_columns + else: + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) if exclude is None: exclude = set() @@ -2130,7 +2140,7 @@ def from_records( columns = columns.drop(exclude) manager = get_option("mode.data_manager") - mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager) + mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) return cls(mgr) @@ -2343,7 +2353,6 @@ def _from_arrays( arrays, columns, index, - columns, dtype=dtype, verify_integrity=verify_integrity, typ=manager, diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 1f45c64eceb98..a859245b5a9fa 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -99,9 +99,8 @@ def arrays_to_mgr( arrays, - arr_names: Index, + columns: Index, index, - columns, *, dtype: DtypeObj | None = None, verify_integrity: bool = True, @@ -133,7 +132,7 @@ def arrays_to_mgr( if typ == "block": return create_block_manager_from_arrays( - arrays, arr_names, axes, consolidate=consolidate + arrays, columns, axes, consolidate=consolidate ) elif typ == "array": if len(columns) != len(arrays): @@ -187,7 +186,7 @@ def rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype, typ=typ) + mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ) if copy: mgr = mgr.copy() @@ -226,7 +225,7 @@ def mgr_to_mgr(mgr, typ: str, copy: bool = True): else: if mgr.ndim == 2: new_mgr = arrays_to_mgr( - mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block" + mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block" ) else: new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index) @@ -288,7 +287,7 @@ def ndarray_to_mgr( else: columns = ensure_index(columns) - return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) + return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ @@ -409,7 +408,6 @@ def dict_to_mgr( from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) - data_names = arrays.index missing = arrays.isna() if index is None: # GH10856 @@ -433,11 +431,11 @@ def dict_to_mgr( arrays.loc[missing] = [val] * missing.sum() arrays = list(arrays) - data_names = ensure_index(columns) + columns = ensure_index(columns) else: keys = list(data.keys()) - columns = data_names = Index(keys) + columns = Index(keys) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies @@ -457,9 +455,7 @@ def dict_to_mgr( ] # TODO: can we get rid of the dt64tz special case above? - return arrays_to_mgr( - arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy - ) + return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy) def nested_data_to_arrays(