Skip to content

Commit 51c4297

Browse files
authored
PERF: DataFrame constructor (#43142)
1 parent 5410fb7 commit 51c4297

File tree

5 files changed, with 42 additions (+42) and 11 deletions (-11)

pandas/core/internals/construction.py

+20
Original file line number | Diff line number | Diff line change
@@ -121,11 +121,30 @@ def arrays_to_mgr(
121121

122122
# don't force copy because getting jammed in an ndarray anyway
123123
arrays = _homogenize(arrays, index, dtype)
124+
# _homogenize ensures
125+
# - all(len(x) == len(index) for x in arrays)
126+
# - all(x.ndim == 1 for x in arrays)
127+
# - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
128+
# - all(type(x) is not PandasArray for x in arrays)
124129

125130
else:
126131
index = ensure_index(index)
127132

133+
# Reached via DataFrame._from_arrays; we do validation here
134+
for arr in arrays:
135+
if (
136+
not isinstance(arr, (np.ndarray, ExtensionArray))
137+
or arr.ndim != 1
138+
or len(arr) != len(index)
139+
):
140+
raise ValueError(
141+
"Arrays must be 1-dimensional np.ndarray or ExtensionArray "
142+
"with length matching len(index)"
143+
)
144+
128145
columns = ensure_index(columns)
146+
if len(columns) != len(arrays):
147+
raise ValueError("len(arrays) must match len(columns)")
129148

130149
# from BlockManager perspective
131150
axes = [columns, index]
@@ -581,6 +600,7 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
581600
val = sanitize_array(
582601
val, index, dtype=dtype, copy=False, raise_cast_failure=False
583602
)
603+
com.require_length_match(val, index)
584604

585605
homogenized.append(val)
586606

pandas/core/internals/managers.py

+7-2
Original file line number | Diff line number | Diff line change
@@ -1813,15 +1813,20 @@ def create_block_manager_from_column_arrays(
18131813
axes: list[Index],
18141814
consolidate: bool = True,
18151815
) -> BlockManager:
1816-
# Assertions disabled for performance
1816+
# Assertions disabled for performance (caller is responsible for verifying)
18171817
# assert isinstance(axes, list)
18181818
# assert all(isinstance(x, Index) for x in axes)
1819+
# assert all(x.ndim == 1 for x in arrays)
1820+
# assert all(len(x) == len(axes[1]) for x in arrays)
1821+
# assert len(arrays) == len(axes[0])
1822+
# These last three are sufficient to allow us to safely pass
1823+
# verify_integrity=False below.
18191824

18201825
arrays = [extract_array(x, extract_numpy=True) for x in arrays]
18211826

18221827
try:
18231828
blocks = _form_blocks(arrays, consolidate)
1824-
mgr = BlockManager(blocks, axes)
1829+
mgr = BlockManager(blocks, axes, verify_integrity=False)
18251830
except ValueError as e:
18261831
raise construction_error(len(arrays), arrays[0].shape, axes, e)
18271832
if consolidate:

pandas/tests/frame/constructors/test_from_records.py

+2-4
Original file line number | Diff line number | Diff line change
@@ -188,8 +188,7 @@ def test_from_records_bad_index_column(self):
188188
# should fail
189189
msg = "|".join(
190190
[
191-
r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)",
192-
"Passed arrays should have the same length as the rows Index: 10 vs 1",
191+
r"Length of values \(10\) does not match length of index \(1\)",
193192
]
194193
)
195194
with pytest.raises(ValueError, match=msg):
@@ -268,8 +267,7 @@ def test_from_records_to_records(self):
268267
# wrong length
269268
msg = "|".join(
270269
[
271-
r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)",
272-
"Passed arrays should have the same length as the rows Index: 2 vs 1",
270+
r"Length of values \(2\) does not match length of index \(1\)",
273271
]
274272
)
275273
with pytest.raises(ValueError, match=msg):

pandas/tests/frame/test_constructors.py

+12-2
Original file line number | Diff line number | Diff line change
@@ -2222,8 +2222,7 @@ def test_construct_from_listlikes_mismatched_lengths(self):
22222222
# invalid (shape)
22232223
msg = "|".join(
22242224
[
2225-
r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)",
2226-
"Passed arrays should have the same length as the rows Index",
2225+
r"Length of values \(6\) does not match length of index \(3\)",
22272226
]
22282227
)
22292228
msg2 = "will be changed to match the behavior"
@@ -2791,6 +2790,17 @@ def test_construction_from_ndarray_datetimelike(self):
27912790
df = DataFrame(arr)
27922791
assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays)
27932792

2793+
def test_construction_from_ndarray_with_eadtype_mismatched_columns(self):
2794+
arr = np.random.randn(10, 2)
2795+
dtype = pd.array([2.0]).dtype
2796+
msg = r"len\(arrays\) must match len\(columns\)"
2797+
with pytest.raises(ValueError, match=msg):
2798+
DataFrame(arr, columns=["foo"], dtype=dtype)
2799+
2800+
arr2 = pd.array([2.0, 3.0, 4.0])
2801+
with pytest.raises(ValueError, match=msg):
2802+
DataFrame(arr2, columns=["foo", "bar"])
2803+
27942804

27952805
def get1(obj):
27962806
if isinstance(obj, Series):

pandas/tests/io/json/test_pandas.py

+1-3
Original file line number | Diff line number | Diff line change
@@ -317,9 +317,7 @@ def test_roundtrip_mixed(self, request, orient, convert_axes, numpy):
317317
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
318318
"|".join(
319319
[
320-
r"Shape of passed values is \(3, 2\), indices imply \(2, 2\)",
321-
"Passed arrays should have the same length as the rows Index: "
322-
"3 vs 2 rows",
320+
r"Length of values \(3\) does not match length of index \(2\)",
323321
]
324322
),
325323
"split",

0 commit comments

Comments
 (0)