diff --git a/pandas/api/internals.py b/pandas/api/internals.py new file mode 100644 index 0000000000000..03d8992a87575 --- /dev/null +++ b/pandas/api/internals.py @@ -0,0 +1,62 @@ +import numpy as np + +from pandas._typing import ArrayLike + +from pandas import ( + DataFrame, + Index, +) +from pandas.core.internals.api import _make_block +from pandas.core.internals.managers import BlockManager as _BlockManager + + +def create_dataframe_from_blocks( + blocks: list[tuple[ArrayLike, np.ndarray]], index: Index, columns: Index +) -> DataFrame: + """ + Low-level function to create a DataFrame from arrays as they are + representing the block structure of the resulting DataFrame. + + Attention: this is an advanced, low-level function that should only be + used if you know that the below-mentioned assumptions are guaranteed. + If passing data that do not follow those assumptions, subsequent + subsequent operations on the resulting DataFrame might lead to strange + errors. + For almost all use cases, you should use the standard pd.DataFrame(..) + constructor instead. If you are planning to use this function, let us + know by opening an issue at https://github.com/pandas-dev/pandas/issues. + + Assumptions: + + - The block arrays are either a 2D numpy array or a pandas ExtensionArray + - In case of a numpy array, it is assumed to already be in the expected + shape for Blocks (2D, (cols, rows), i.e. transposed compared to the + DataFrame columns). + - All arrays are taken as is (no type inference) and expected to have the + correct size. + - The placement arrays have the correct length (equalling the number of + columns that its equivalent block array represents), and all placement + arrays together form a complete set of 0 to n_columns - 1. + + Parameters + ---------- + blocks : list of tuples of (block_array, block_placement) + This should be a list of tuples existing of (block_array, block_placement), + where: + + - block_array is a 2D numpy array or a 1D ExtensionArray, following the + requirements listed above. + - block_placement is a 1D integer numpy array + index : Index + The Index object for the `index` of the resulting DataFrame. + columns : Index + The Index object for the `columns` of the resulting DataFrame. + + Returns + ------- + DataFrame + """ + block_objs = [_make_block(*block) for block in blocks] + axes = [columns, index] + mgr = _BlockManager(block_objs, axes) + return DataFrame._from_mgr(mgr, mgr.axes) diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index d6e1e8b38dfe3..ef25d7ed5ae9e 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -18,10 +18,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, + ExtensionDtype, PeriodDtype, ) -from pandas.core.arrays import DatetimeArray +from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, +) from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( check_ndim, @@ -32,11 +36,43 @@ ) if TYPE_CHECKING: - from pandas._typing import Dtype + from pandas._typing import ( + ArrayLike, + Dtype, + ) from pandas.core.internals.blocks import Block +def _make_block(values: ArrayLike, placement: np.ndarray) -> Block: + """ + This is an analogue to blocks.new_block(_2d) that ensures: + 1) correct dimension for EAs that support 2D (`ensure_block_shape`), and + 2) correct EA class for datetime64/timedelta64 (`maybe_coerce_values`). + + The input `values` is assumed to be either numpy array or ExtensionArray: + - In case of a numpy array, it is assumed to already be in the expected + shape for Blocks (2D, (cols, rows)). + - In case of an ExtensionArray the input can be 1D, also for EAs that are + internally stored as 2D. + + For the rest no preprocessing or validation is done, except for those dtypes + that are internally stored as EAs but have an exact numpy equivalent (and at + the moment use that numpy dtype), i.e. datetime64/timedelta64. + """ + dtype = values.dtype + klass = get_block_type(dtype) + placement_obj = BlockPlacement(placement) + + if (isinstance(dtype, ExtensionDtype) and dtype._supports_2d) or isinstance( + values, (DatetimeArray, TimedeltaArray) + ): + values = ensure_block_shape(values, ndim=2) + + values = maybe_coerce_values(values) + return klass(values, ndim=2, placement=placement_obj) + + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None ) -> Block: diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index e32e5a268d46d..5ee9932fbd051 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -248,6 +248,7 @@ class TestApi(Base): "indexers", "interchange", "typing", + "internals", ] allowed_typing = [ "DataFrameGroupBy", diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 5bff1b7be3080..7ab8988521fdf 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -3,10 +3,14 @@ in core.internals """ +import datetime + +import numpy as np import pytest import pandas as pd import pandas._testing as tm +from pandas.api.internals import create_dataframe_from_blocks from pandas.core import internals from pandas.core.internals import api @@ -71,3 +75,91 @@ def test_create_block_manager_from_blocks_deprecated(): ) with tm.assert_produces_warning(DeprecationWarning, match=msg): internals.create_block_manager_from_blocks + + +def test_create_dataframe_from_blocks(float_frame): + block = float_frame._mgr.blocks[0] + index = float_frame.index.copy() + columns = float_frame.columns.copy() + + result = create_dataframe_from_blocks( + [(block.values, block.mgr_locs.as_array)], index=index, columns=columns + ) + tm.assert_frame_equal(result, float_frame) + + +def test_create_dataframe_from_blocks_types(): + df = pd.DataFrame( + { + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("uint8"), + "float": [2.0, np.nan, 3.0], + "bool": np.array([True, False, True]), + "boolean": pd.array([True, False, None], dtype="boolean"), + "string": list("abc"), + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3).tz_localize( + "Europe/Brussels" + ), + "timedelta": pd.timedelta_range("1 day", periods=3), + "period": pd.period_range("2012-01-01", periods=3, freq="D"), + "categorical": pd.Categorical(["a", "b", "a"]), + "interval": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]), + } + ) + + result = create_dataframe_from_blocks( + [(block.values, block.mgr_locs.as_array) for block in df._mgr.blocks], + index=df.index, + columns=df.columns, + ) + tm.assert_frame_equal(result, df) + + +def test_create_dataframe_from_blocks_datetimelike(): + # extension dtypes that have an exact matching numpy dtype can also be + # be passed as a numpy array + index, columns = pd.RangeIndex(3), pd.Index(["a", "b", "c", "d"]) + + block_array1 = np.arange( + datetime.datetime(2020, 1, 1), + datetime.datetime(2020, 1, 7), + step=datetime.timedelta(1), + ).reshape((2, 3)) + block_array2 = np.arange( + datetime.timedelta(1), datetime.timedelta(7), step=datetime.timedelta(1) + ).reshape((2, 3)) + result = create_dataframe_from_blocks( + [(block_array1, np.array([0, 2])), (block_array2, np.array([1, 3]))], + index=index, + columns=columns, + ) + expected = pd.DataFrame( + { + "a": pd.date_range("2020-01-01", periods=3, unit="us"), + "b": pd.timedelta_range("1 days", periods=3, unit="us"), + "c": pd.date_range("2020-01-04", periods=3, unit="us"), + "d": pd.timedelta_range("4 days", periods=3, unit="us"), + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2020-01-01", periods=3), + pd.date_range("2020-01-01", periods=3, tz="UTC"), + pd.period_range("2012-01-01", periods=3, freq="D"), + pd.timedelta_range("1 day", periods=3), + ], +) +def test_create_dataframe_from_blocks_1dEA(array): + # ExtensionArrays can be passed as 1D even if stored under the hood as 2D + df = pd.DataFrame({"a": array}) + + block = df._mgr.blocks[0] + result = create_dataframe_from_blocks( + [(block.values[0], block.mgr_locs.as_array)], index=df.index, columns=df.columns + ) + tm.assert_frame_equal(result, df) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index a732d3f83a40a..ba3123a07df4b 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -54,6 +54,7 @@ # TODO(4.0): GH#55043 - remove upon removal of CoW option "_get_option", "_fill_limit_area_1d", + "_make_block", }