Skip to content

Commit 6945116

Browse files
[ArrayManager] DataFrame constructors (#39991)
1 parent 1abbbcf commit 6945116

File tree

8 files changed

+121
-39
lines changed

8 files changed

+121
-39
lines changed

.github/workflows/ci.yml

+2
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ jobs:
153153
run: |
154154
source activate pandas-dev
155155
pytest pandas/tests/frame/methods --array-manager
156+
pytest pandas/tests/frame/test_constructors.py --array-manager
157+
pytest pandas/tests/frame/constructors/ --array-manager
156158
pytest pandas/tests/frame/test_reductions.py --array-manager
157159
pytest pandas/tests/reductions/ --array-manager
158160
pytest pandas/tests/generic/test_generic.py --array-manager

pandas/core/frame.py

+42-18
Original file line numberDiff line numberDiff line change
@@ -563,39 +563,55 @@ def __init__(
563563
if isinstance(data, DataFrame):
564564
data = data._mgr
565565

566-
if isinstance(data, (BlockManager, ArrayManager)):
567-
if index is None and columns is None and dtype is None and copy is False:
568-
# GH#33357 fastpath
569-
NDFrame.__init__(self, data)
570-
return
566+
# first check if a Manager is passed without any other arguments
567+
# -> use fastpath (without checking Manager type)
568+
if (
569+
index is None
570+
and columns is None
571+
and dtype is None
572+
and copy is False
573+
and isinstance(data, (BlockManager, ArrayManager))
574+
):
575+
# GH#33357 fastpath
576+
NDFrame.__init__(self, data)
577+
return
571578

579+
manager = get_option("mode.data_manager")
580+
581+
if isinstance(data, (BlockManager, ArrayManager)):
572582
mgr = self._init_mgr(
573583
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
574584
)
575585

576586
elif isinstance(data, dict):
577-
mgr = dict_to_mgr(data, index, columns, dtype=dtype)
587+
mgr = dict_to_mgr(data, index, columns, dtype=dtype, typ=manager)
578588
elif isinstance(data, ma.MaskedArray):
579589
import numpy.ma.mrecords as mrecords
580590

581591
# masked recarray
582592
if isinstance(data, mrecords.MaskedRecords):
583-
mgr = rec_array_to_mgr(data, index, columns, dtype, copy)
593+
mgr = rec_array_to_mgr(data, index, columns, dtype, copy, typ=manager)
584594

585595
# a masked array
586596
else:
587597
data = sanitize_masked_array(data)
588-
mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy)
598+
mgr = ndarray_to_mgr(
599+
data, index, columns, dtype=dtype, copy=copy, typ=manager
600+
)
589601

590602
elif isinstance(data, (np.ndarray, Series, Index)):
591603
if data.dtype.names:
592604
# i.e. numpy structured array
593-
mgr = rec_array_to_mgr(data, index, columns, dtype, copy)
605+
mgr = rec_array_to_mgr(data, index, columns, dtype, copy, typ=manager)
594606
elif getattr(data, "name", None) is not None:
595607
# i.e. Series/Index with non-None name
596-
mgr = dict_to_mgr({data.name: data}, index, columns, dtype=dtype)
608+
mgr = dict_to_mgr(
609+
{data.name: data}, index, columns, dtype=dtype, typ=manager
610+
)
597611
else:
598-
mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy)
612+
mgr = ndarray_to_mgr(
613+
data, index, columns, dtype=dtype, copy=copy, typ=manager
614+
)
599615

600616
# For data is list-like, or Iterable (will consume into list)
601617
elif is_list_like(data):
@@ -610,11 +626,15 @@ def __init__(
610626
arrays, columns, index = nested_data_to_arrays(
611627
data, columns, index, dtype
612628
)
613-
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
629+
mgr = arrays_to_mgr(
630+
arrays, columns, index, columns, dtype=dtype, typ=manager
631+
)
614632
else:
615-
mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy)
633+
mgr = ndarray_to_mgr(
634+
data, index, columns, dtype=dtype, copy=copy, typ=manager
635+
)
616636
else:
617-
mgr = dict_to_mgr({}, index, columns, dtype=dtype)
637+
mgr = dict_to_mgr({}, index, columns, dtype=dtype, typ=manager)
618638
# For data is scalar
619639
else:
620640
if index is None or columns is None:
@@ -631,18 +651,19 @@ def __init__(
631651
construct_1d_arraylike_from_scalar(data, len(index), dtype)
632652
for _ in range(len(columns))
633653
]
634-
mgr = arrays_to_mgr(values, columns, index, columns, dtype=None)
654+
mgr = arrays_to_mgr(
655+
values, columns, index, columns, dtype=None, typ=manager
656+
)
635657
else:
636658
values = construct_2d_arraylike_from_scalar(
637659
data, len(index), len(columns), dtype, copy
638660
)
639661

640662
mgr = ndarray_to_mgr(
641-
values, index, columns, dtype=values.dtype, copy=False
663+
values, index, columns, dtype=values.dtype, copy=False, typ=manager
642664
)
643665

644666
# ensure correct Manager type according to settings
645-
manager = get_option("mode.data_manager")
646667
mgr = mgr_to_mgr(mgr, typ=manager)
647668

648669
NDFrame.__init__(self, mgr)
@@ -1970,7 +1991,8 @@ def from_records(
19701991
arr_columns = arr_columns.drop(arr_exclude)
19711992
columns = columns.drop(exclude)
19721993

1973-
mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
1994+
manager = get_option("mode.data_manager")
1995+
mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager)
19741996

19751997
return cls(mgr)
19761998

@@ -2177,13 +2199,15 @@ def _from_arrays(
21772199
if dtype is not None:
21782200
dtype = pandas_dtype(dtype)
21792201

2202+
manager = get_option("mode.data_manager")
21802203
mgr = arrays_to_mgr(
21812204
arrays,
21822205
columns,
21832206
index,
21842207
columns,
21852208
dtype=dtype,
21862209
verify_integrity=verify_integrity,
2210+
typ=manager,
21872211
)
21882212
return cls(mgr)
21892213

pandas/core/generic.py

+3
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@
139139
ArrayManager,
140140
BlockManager,
141141
)
142+
from pandas.core.internals.construction import mgr_to_mgr
142143
from pandas.core.missing import find_valid_index
143144
from pandas.core.ops import align_method_FRAME
144145
from pandas.core.reshape.concat import concat
@@ -5755,6 +5756,8 @@ def _to_dict_of_blocks(self, copy: bool_t = True):
57555756
Internal ONLY - only works for BlockManager
57565757
"""
57575758
mgr = self._mgr
5759+
# convert to BlockManager if needed -> this way support ArrayManager as well
5760+
mgr = mgr_to_mgr(mgr, "block")
57585761
mgr = cast(BlockManager, mgr)
57595762
return {
57605763
k: self._constructor(v).__finalize__(self)

pandas/core/internals/array_manager.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -840,7 +840,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False
840840

841841
value = extract_array(value, extract_numpy=True)
842842
if value.ndim == 2:
843-
value = value[0, :]
843+
if value.shape[0] == 1:
844+
value = value[0, :]
845+
else:
846+
raise ValueError(
847+
f"Expected a 1D array, got an array with shape {value.shape}"
848+
)
849+
844850
# TODO self.arrays can be empty
845851
# assert len(value) == len(self.arrays[0])
846852

pandas/core/internals/construction.py

+24-13
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@
6969
get_objs_combined_axis,
7070
union_indexes,
7171
)
72+
from pandas.core.internals.array_manager import ArrayManager
7273
from pandas.core.internals.managers import (
74+
BlockManager,
7375
create_block_manager_from_arrays,
7476
create_block_manager_from_blocks,
7577
)
@@ -88,6 +90,7 @@ def arrays_to_mgr(
8890
columns,
8991
dtype: Optional[DtypeObj] = None,
9092
verify_integrity: bool = True,
93+
typ: Optional[str] = None,
9194
):
9295
"""
9396
Segregate Series based on type and coerce into matrices.
@@ -114,7 +117,12 @@ def arrays_to_mgr(
114117
# from BlockManager perspective
115118
axes = [columns, index]
116119

117-
return create_block_manager_from_arrays(arrays, arr_names, axes)
120+
if typ == "block":
121+
return create_block_manager_from_arrays(arrays, arr_names, axes)
122+
elif typ == "array":
123+
return ArrayManager(arrays, [index, columns])
124+
else:
125+
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
118126

119127

120128
def rec_array_to_mgr(
@@ -123,6 +131,7 @@ def rec_array_to_mgr(
123131
columns,
124132
dtype: Optional[DtypeObj],
125133
copy: bool,
134+
typ: str,
126135
):
127136
"""
128137
Extract from a masked rec array and create the manager.
@@ -150,7 +159,7 @@ def rec_array_to_mgr(
150159
if columns is None:
151160
columns = arr_columns
152161

153-
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)
162+
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ)
154163

155164
if copy:
156165
mgr = mgr.copy()
@@ -180,19 +189,14 @@ def mgr_to_mgr(mgr, typ: str):
180189
Convert to specific type of Manager. Does not copy if the type is already
181190
correct. Does not guarantee a copy otherwise.
182191
"""
183-
from pandas.core.internals import (
184-
ArrayManager,
185-
BlockManager,
186-
)
187-
188192
new_mgr: Manager
189193

190194
if typ == "block":
191195
if isinstance(mgr, BlockManager):
192196
new_mgr = mgr
193197
else:
194198
new_mgr = arrays_to_mgr(
195-
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None
199+
mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block"
196200
)
197201
elif typ == "array":
198202
if isinstance(mgr, ArrayManager):
@@ -201,15 +205,17 @@ def mgr_to_mgr(mgr, typ: str):
201205
arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))]
202206
new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
203207
else:
204-
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'")
208+
raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
205209
return new_mgr
206210

207211

208212
# ---------------------------------------------------------------------
209213
# DataFrame Constructor Interface
210214

211215

212-
def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
216+
def ndarray_to_mgr(
217+
values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str
218+
):
213219
# used in DataFrame.__init__
214220
# input must be a ndarray, list, Series, index
215221

@@ -239,7 +245,7 @@ def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool
239245
if columns is None:
240246
columns = Index(range(len(values)))
241247

242-
return arrays_to_mgr(values, columns, index, columns, dtype=dtype)
248+
return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ)
243249

244250
# by definition an array here
245251
# the dtypes will be coerced to a single dtype
@@ -303,7 +309,7 @@ def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool
303309
return create_block_manager_from_blocks(block_values, [columns, index])
304310

305311

306-
def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
312+
def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str):
307313
"""
308314
Segregate Series based on type and coerce into matrices.
309315
Needs to handle a lot of exceptional cases.
@@ -349,7 +355,7 @@ def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
349355
arrays = [
350356
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
351357
]
352-
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
358+
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ)
353359

354360

355361
def nested_data_to_arrays(
@@ -443,6 +449,11 @@ def _homogenize(data, index: Index, dtype: Optional[DtypeObj]):
443449
# Forces alignment. No need to copy data since we
444450
# are putting it into an ndarray later
445451
val = val.reindex(index, copy=False)
452+
# TODO extract_array should be preferred, but that gives failures for
453+
# `extension/test_numpy.py` (extract_array will convert numpy arrays
454+
# to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021
455+
# val = extract_array(val, extract_numpy=True)
456+
val = val._values
446457
else:
447458
if isinstance(val, dict):
448459
if oindex is None:

pandas/tests/frame/constructors/test_from_records.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytz
77

88
from pandas.compat import is_platform_little_endian
9+
import pandas.util._test_decorators as td
910

1011
from pandas import (
1112
CategoricalIndex,
@@ -119,6 +120,8 @@ def test_from_records_sequencelike(self):
119120
tm.assert_series_equal(result["C"], df["C"])
120121
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))
121122

123+
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
124+
def test_from_records_sequencelike_empty(self):
122125
# empty case
123126
result = DataFrame.from_records([], columns=["foo", "bar", "baz"])
124127
assert len(result) == 0
@@ -185,7 +188,12 @@ def test_from_records_bad_index_column(self):
185188
tm.assert_index_equal(df1.index, Index(df.C))
186189

187190
# should fail
188-
msg = r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)"
191+
msg = "|".join(
192+
[
193+
r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)",
194+
"Passed arrays should have the same length as the rows Index: 10 vs 1",
195+
]
196+
)
189197
with pytest.raises(ValueError, match=msg):
190198
DataFrame.from_records(df, index=[2])
191199
with pytest.raises(KeyError, match=r"^2$"):
@@ -209,6 +217,7 @@ def __iter__(self):
209217
expected = DataFrame.from_records(tups)
210218
tm.assert_frame_equal(result, expected)
211219

220+
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
212221
def test_from_records_len0_with_columns(self):
213222
# GH#2633
214223
result = DataFrame.from_records([], index="foo", columns=["foo", "bar"])
@@ -260,7 +269,12 @@ def test_from_records_to_records(self):
260269
tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2))
261270

262271
# wrong length
263-
msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
272+
msg = "|".join(
273+
[
274+
r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)",
275+
"Passed arrays should have the same length as the rows Index: 2 vs 1",
276+
]
277+
)
264278
with pytest.raises(ValueError, match=msg):
265279
DataFrame.from_records(arr, index=index[:-1])
266280

@@ -387,6 +401,7 @@ def create_dict(order_id):
387401
result = DataFrame.from_records(documents, index=["order_id", "quantity"])
388402
assert result.index.names == ("order_id", "quantity")
389403

404+
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
390405
def test_from_records_misc_brokenness(self):
391406
# GH#2179
392407

@@ -425,6 +440,7 @@ def test_from_records_misc_brokenness(self):
425440
)
426441
tm.assert_series_equal(result, expected)
427442

443+
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records
428444
def test_from_records_empty(self):
429445
# GH#3562
430446
result = DataFrame.from_records([], columns=["a", "b", "c"])

0 commit comments

Comments
 (0)