Skip to content

Commit 1183936

Browse files
jorisvandenbossche authored and pmhatre1 committed
Add low-level create_dataframe_from_blocks helper function (pandas-dev#58197)
1 parent 218c9e8 commit 1183936

File tree

5 files changed

+194
-2
lines changed

5 files changed

+194
-2
lines changed

pandas/api/internals.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import numpy as np
2+
3+
from pandas._typing import ArrayLike
4+
5+
from pandas import (
6+
DataFrame,
7+
Index,
8+
)
9+
from pandas.core.internals.api import _make_block
10+
from pandas.core.internals.managers import BlockManager as _BlockManager
11+
12+
13+
def create_dataframe_from_blocks(
14+
blocks: list[tuple[ArrayLike, np.ndarray]], index: Index, columns: Index
15+
) -> DataFrame:
16+
"""
17+
Low-level function to create a DataFrame from arrays as they are
18+
representing the block structure of the resulting DataFrame.
19+
20+
Attention: this is an advanced, low-level function that should only be
21+
used if you know that the below-mentioned assumptions are guaranteed.
22+
If passing data that do not follow those assumptions, subsequent
23+
operations on the resulting DataFrame might lead to strange
24+
errors.
25+
For almost all use cases, you should use the standard pd.DataFrame(..)
26+
constructor instead. If you are planning to use this function, let us
27+
know by opening an issue at https://github.com/pandas-dev/pandas/issues.
28+
29+
Assumptions:
30+
31+
- Each block array is either a 2D numpy array or a pandas ExtensionArray
32+
- In case of a numpy array, it is assumed to already be in the expected
33+
shape for Blocks (2D, (cols, rows), i.e. transposed compared to the
34+
DataFrame columns).
35+
- All arrays are taken as is (no type inference) and expected to have the
36+
correct size.
37+
- The placement arrays have the correct length (equalling the number of
38+
columns that its equivalent block array represents), and all placement
39+
arrays together form a complete set of 0 to n_columns - 1.
40+
41+
Parameters
42+
----------
43+
blocks : list of tuples of (block_array, block_placement)
44+
This should be a list of tuples consisting of (block_array, block_placement),
45+
where:
46+
47+
- block_array is a 2D numpy array or a 1D ExtensionArray, following the
48+
requirements listed above.
49+
- block_placement is a 1D integer numpy array
50+
index : Index
51+
The Index object for the `index` of the resulting DataFrame.
52+
columns : Index
53+
The Index object for the `columns` of the resulting DataFrame.
54+
55+
Returns
56+
-------
57+
DataFrame
58+
"""
59+
block_objs = [_make_block(*block) for block in blocks]
60+
axes = [columns, index]
61+
mgr = _BlockManager(block_objs, axes)
62+
return DataFrame._from_mgr(mgr, mgr.axes)

pandas/core/internals/api.py

+38-2
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,14 @@
1818
from pandas.core.dtypes.common import pandas_dtype
1919
from pandas.core.dtypes.dtypes import (
2020
DatetimeTZDtype,
21+
ExtensionDtype,
2122
PeriodDtype,
2223
)
2324

24-
from pandas.core.arrays import DatetimeArray
25+
from pandas.core.arrays import (
26+
DatetimeArray,
27+
TimedeltaArray,
28+
)
2529
from pandas.core.construction import extract_array
2630
from pandas.core.internals.blocks import (
2731
check_ndim,
@@ -32,11 +36,43 @@
3236
)
3337

3438
if TYPE_CHECKING:
35-
from pandas._typing import Dtype
39+
from pandas._typing import (
40+
ArrayLike,
41+
Dtype,
42+
)
3643

3744
from pandas.core.internals.blocks import Block
3845

3946

47+
def _make_block(values: ArrayLike, placement: np.ndarray) -> Block:
48+
"""
49+
This is an analogue to blocks.new_block(_2d) that ensures:
50+
1) correct dimension for EAs that support 2D (`ensure_block_shape`), and
51+
2) correct EA class for datetime64/timedelta64 (`maybe_coerce_values`).
52+
53+
The input `values` is assumed to be either numpy array or ExtensionArray:
54+
- In case of a numpy array, it is assumed to already be in the expected
55+
shape for Blocks (2D, (cols, rows)).
56+
- In case of an ExtensionArray the input can be 1D, also for EAs that are
57+
internally stored as 2D.
58+
59+
For the rest no preprocessing or validation is done, except for those dtypes
60+
that are internally stored as EAs but have an exact numpy equivalent (and at
61+
the moment use that numpy dtype), i.e. datetime64/timedelta64.
62+
"""
63+
dtype = values.dtype
64+
klass = get_block_type(dtype)
65+
placement_obj = BlockPlacement(placement)
66+
67+
if (isinstance(dtype, ExtensionDtype) and dtype._supports_2d) or isinstance(
68+
values, (DatetimeArray, TimedeltaArray)
69+
):
70+
values = ensure_block_shape(values, ndim=2)
71+
72+
values = maybe_coerce_values(values)
73+
return klass(values, ndim=2, placement=placement_obj)
74+
75+
4076
def make_block(
4177
values, placement, klass=None, ndim=None, dtype: Dtype | None = None
4278
) -> Block:

pandas/tests/api/test_api.py

+1
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ class TestApi(Base):
248248
"indexers",
249249
"interchange",
250250
"typing",
251+
"internals",
251252
]
252253
allowed_typing = [
253254
"DataFrameGroupBy",

pandas/tests/internals/test_api.py

+92
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33
in core.internals
44
"""
55

6+
import datetime
7+
8+
import numpy as np
69
import pytest
710

811
import pandas as pd
912
import pandas._testing as tm
13+
from pandas.api.internals import create_dataframe_from_blocks
1014
from pandas.core import internals
1115
from pandas.core.internals import api
1216

@@ -71,3 +75,91 @@ def test_create_block_manager_from_blocks_deprecated():
7175
)
7276
with tm.assert_produces_warning(DeprecationWarning, match=msg):
7377
internals.create_block_manager_from_blocks
78+
79+
80+
def test_create_dataframe_from_blocks(float_frame):
81+
block = float_frame._mgr.blocks[0]
82+
index = float_frame.index.copy()
83+
columns = float_frame.columns.copy()
84+
85+
result = create_dataframe_from_blocks(
86+
[(block.values, block.mgr_locs.as_array)], index=index, columns=columns
87+
)
88+
tm.assert_frame_equal(result, float_frame)
89+
90+
91+
def test_create_dataframe_from_blocks_types():
92+
df = pd.DataFrame(
93+
{
94+
"int": list(range(1, 4)),
95+
"uint": np.arange(3, 6).astype("uint8"),
96+
"float": [2.0, np.nan, 3.0],
97+
"bool": np.array([True, False, True]),
98+
"boolean": pd.array([True, False, None], dtype="boolean"),
99+
"string": list("abc"),
100+
"datetime": pd.date_range("20130101", periods=3),
101+
"datetimetz": pd.date_range("20130101", periods=3).tz_localize(
102+
"Europe/Brussels"
103+
),
104+
"timedelta": pd.timedelta_range("1 day", periods=3),
105+
"period": pd.period_range("2012-01-01", periods=3, freq="D"),
106+
"categorical": pd.Categorical(["a", "b", "a"]),
107+
"interval": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]),
108+
}
109+
)
110+
111+
result = create_dataframe_from_blocks(
112+
[(block.values, block.mgr_locs.as_array) for block in df._mgr.blocks],
113+
index=df.index,
114+
columns=df.columns,
115+
)
116+
tm.assert_frame_equal(result, df)
117+
118+
119+
def test_create_dataframe_from_blocks_datetimelike():
120+
# extension dtypes that have an exact matching numpy dtype can also be
121+
# passed as a numpy array
122+
index, columns = pd.RangeIndex(3), pd.Index(["a", "b", "c", "d"])
123+
124+
block_array1 = np.arange(
125+
datetime.datetime(2020, 1, 1),
126+
datetime.datetime(2020, 1, 7),
127+
step=datetime.timedelta(1),
128+
).reshape((2, 3))
129+
block_array2 = np.arange(
130+
datetime.timedelta(1), datetime.timedelta(7), step=datetime.timedelta(1)
131+
).reshape((2, 3))
132+
result = create_dataframe_from_blocks(
133+
[(block_array1, np.array([0, 2])), (block_array2, np.array([1, 3]))],
134+
index=index,
135+
columns=columns,
136+
)
137+
expected = pd.DataFrame(
138+
{
139+
"a": pd.date_range("2020-01-01", periods=3, unit="us"),
140+
"b": pd.timedelta_range("1 days", periods=3, unit="us"),
141+
"c": pd.date_range("2020-01-04", periods=3, unit="us"),
142+
"d": pd.timedelta_range("4 days", periods=3, unit="us"),
143+
}
144+
)
145+
tm.assert_frame_equal(result, expected)
146+
147+
148+
@pytest.mark.parametrize(
149+
"array",
150+
[
151+
pd.date_range("2020-01-01", periods=3),
152+
pd.date_range("2020-01-01", periods=3, tz="UTC"),
153+
pd.period_range("2012-01-01", periods=3, freq="D"),
154+
pd.timedelta_range("1 day", periods=3),
155+
],
156+
)
157+
def test_create_dataframe_from_blocks_1dEA(array):
158+
# ExtensionArrays can be passed as 1D even if stored under the hood as 2D
159+
df = pd.DataFrame({"a": array})
160+
161+
block = df._mgr.blocks[0]
162+
result = create_dataframe_from_blocks(
163+
[(block.values[0], block.mgr_locs.as_array)], index=df.index, columns=df.columns
164+
)
165+
tm.assert_frame_equal(result, df)

scripts/validate_unwanted_patterns.py

+1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
# TODO(4.0): GH#55043 - remove upon removal of CoW option
5555
"_get_option",
5656
"_fill_limit_area_1d",
57+
"_make_block",
5758
}
5859

5960

0 commit comments

Comments
 (0)