From 4502cdb33a6433b047a492f3e9ae7ecf8d5a16c1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 10:41:23 +0100 Subject: [PATCH 1/2] PERF: allow to skip validation/sanitization in DataFrame._from_arrays --- pandas/core/frame.py | 36 +++++++++++++++++++++++++-- pandas/core/internals/construction.py | 21 +++++++++------- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b9e43b1cd9b05..8fc3e5fb089dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1889,8 +1889,40 @@ def to_records( return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) @classmethod - def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame": - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + def _from_arrays( + cls, arrays, columns, index, dtype=None, verify_integrity=True + ) -> "DataFrame": + """ + Create DataFrame from a list of arrays corresponding to the columns. + + Parameters + ---------- + arrays : list-like of arrays + Each array in the list corresponds to one column, in order. + columns : list-like, Index + The column names for the resulting DataFrame. + index : list-like, Index + The rows labels for the resulting DataFrame. + dtype : dtype, optional + Optional dtype to enforce for all arrays. + verify_integrity : bool, default True + Validate and homogenize all input. If set to False, it is assumed + that all elements of `arrays` are actual arrays to be stored in + a block, have the same length as and are aligned with the index, + and that `columns` and `index` are ensured to be an Index object. 
+ + Returns + ------- + DataFrame + """ + mgr = arrays_to_mgr( + arrays, + columns, + index, + columns, + dtype=dtype, + verify_integrity=verify_integrity, + ) return cls(mgr) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c4416472d451c..3e0fb8455884a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -53,23 +53,26 @@ # BlockManager Interface -def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): +def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None, verify_integrity=True): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ - # figure out the index, if necessary - if index is None: - index = extract_index(arrays) - else: - index = ensure_index(index) + if verify_integrity: + # figure out the index, if necessary + if index is None: + index = extract_index(arrays) + else: + index = ensure_index(index) - # don't force copy because getting jammed in an ndarray anyway - arrays = _homogenize(arrays, index, dtype) + # don't force copy because getting jammed in an ndarray anyway + arrays = _homogenize(arrays, index, dtype) + + columns = ensure_index(columns) # from BlockManager perspective - axes = [ensure_index(columns), index] + axes = [columns, index] return create_block_manager_from_arrays(arrays, arr_names, axes) From a0d1c2769b08f1274f2c6d29bc8fece8bde0e538 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 11:17:54 +0100 Subject: [PATCH 2/2] clarify array type --- pandas/core/frame.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8fc3e5fb089dd..d1ba85c50d91d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1907,9 +1907,10 @@ def _from_arrays( Optional dtype to enforce for all arrays. 
verify_integrity : bool, default True Validate and homogenize all input. If set to False, it is assumed - that all elements of `arrays` are actual arrays to be stored in - a block, have the same length as and are aligned with the index, - and that `columns` and `index` are ensured to be an Index object. + that all elements of `arrays` are actual arrays as they will be + stored in a block (numpy ndarray or ExtensionArray), have the same + length as and are aligned with the index, and that `columns` and + `index` are ensured to be an Index object. Returns -------