Skip to content

Commit 5600a2f

Browse files
authored
PERF: _form_blocks (#43144)
1 parent 4c721c9 commit 5600a2f

File tree

2 files changed

+50
-55
lines changed

2 files changed

+50
-55
lines changed

pandas/core/internals/managers.py

+47-53
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,9 @@
11
from __future__ import annotations
22

3-
from collections import defaultdict
43
import itertools
54
from typing import (
65
Any,
76
Callable,
8-
DefaultDict,
97
Hashable,
108
Sequence,
119
TypeVar,
@@ -67,9 +65,7 @@
6765
)
6866
from pandas.core.internals.blocks import (
6967
Block,
70-
CategoricalBlock,
7168
DatetimeTZBlock,
72-
ExtensionBlock,
7369
ensure_block_shape,
7470
extend_blocks,
7571
get_block_type,
@@ -1863,63 +1859,56 @@ def construction_error(
18631859
# -----------------------------------------------------------------------
18641860

18651861

1866-
def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
1867-
1868-
items_dict: DefaultDict[str, list] = defaultdict(list)
1869-
1870-
for i, name_idx in enumerate(range(len(arrays))):
1862+
def _grouping_func(tup):
1863+
# compat for numpy<1.21, in which comparing a np.dtype with an ExtensionDtype
1864+
# raises instead of returning False. Once earlier numpy versions are dropped,
1865+
# this can be simplified to `return tup[1].dtype`
1866+
dtype = tup[1].dtype
1867+
return isinstance(dtype, np.dtype), dtype
18711868

1872-
v = arrays[name_idx]
18731869

1874-
block_type = get_block_type(v)
1875-
items_dict[block_type.__name__].append((i, v))
1870+
def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
1871+
tuples = list(enumerate(arrays))
18761872

1877-
blocks: list[Block] = []
1878-
if len(items_dict["NumericBlock"]):
1879-
numeric_blocks = multi_blockify(
1880-
items_dict["NumericBlock"], consolidate=consolidate
1881-
)
1882-
blocks.extend(numeric_blocks)
1873+
if not consolidate:
1874+
nbs = _tuples_to_blocks_no_consolidate(tuples, dtype=None)
1875+
return nbs
18831876

1884-
if len(items_dict["DatetimeLikeBlock"]):
1885-
dtlike_blocks = multi_blockify(
1886-
items_dict["DatetimeLikeBlock"], consolidate=consolidate
1887-
)
1888-
blocks.extend(dtlike_blocks)
1877+
# group by dtype
1878+
grouper = itertools.groupby(tuples, _grouping_func)
18891879

1890-
if len(items_dict["DatetimeTZBlock"]):
1891-
dttz_blocks = [
1892-
DatetimeTZBlock(
1893-
ensure_block_shape(extract_array(array), 2),
1894-
placement=BlockPlacement(i),
1895-
ndim=2,
1896-
)
1897-
for i, array in items_dict["DatetimeTZBlock"]
1898-
]
1899-
blocks.extend(dttz_blocks)
1880+
nbs = []
1881+
for (_, dtype), tup_block in grouper:
1882+
block_type = get_block_type(None, dtype)
19001883

1901-
if len(items_dict["ObjectBlock"]) > 0:
1902-
object_blocks = simple_blockify(
1903-
items_dict["ObjectBlock"], np.object_, consolidate=consolidate
1904-
)
1905-
blocks.extend(object_blocks)
1884+
if isinstance(dtype, np.dtype):
1885+
is_dtlike = dtype.kind in ["m", "M"]
19061886

1907-
if len(items_dict["CategoricalBlock"]) > 0:
1908-
cat_blocks = [
1909-
CategoricalBlock(array, placement=BlockPlacement(i), ndim=2)
1910-
for i, array in items_dict["CategoricalBlock"]
1911-
]
1912-
blocks.extend(cat_blocks)
1887+
if issubclass(dtype.type, (str, bytes)):
1888+
dtype = np.dtype(object)
19131889

1914-
if len(items_dict["ExtensionBlock"]):
1915-
external_blocks = [
1916-
ExtensionBlock(array, placement=BlockPlacement(i), ndim=2)
1917-
for i, array in items_dict["ExtensionBlock"]
1918-
]
1890+
values, placement = _stack_arrays(list(tup_block), dtype)
1891+
if is_dtlike:
1892+
values = ensure_wrapped_if_datetimelike(values)
1893+
blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
1894+
nbs.append(blk)
19191895

1920-
blocks.extend(external_blocks)
1896+
elif is_1d_only_ea_dtype(dtype):
1897+
dtype_blocks = [
1898+
block_type(x[1], placement=BlockPlacement(x[0]), ndim=2)
1899+
for x in tup_block
1900+
]
1901+
nbs.extend(dtype_blocks)
19211902

1922-
return blocks
1903+
else:
1904+
dtype_blocks = [
1905+
block_type(
1906+
ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2
1907+
)
1908+
for x in tup_block
1909+
]
1910+
nbs.extend(dtype_blocks)
1911+
return nbs
19231912

19241913

19251914
def simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
@@ -1970,11 +1959,16 @@ def _tuples_to_blocks_no_consolidate(tuples, dtype: DtypeObj | None) -> list[Blo
19701959
if dtype is not None:
19711960
return [
19721961
new_block(
1973-
np.atleast_2d(x[1].astype(dtype, copy=False)), placement=x[0], ndim=2
1962+
ensure_block_shape(x[1].astype(dtype, copy=False), ndim=2),
1963+
placement=x[0],
1964+
ndim=2,
19741965
)
19751966
for x in tuples
19761967
]
1977-
return [new_block(np.atleast_2d(x[1]), placement=x[0], ndim=2) for x in tuples]
1968+
return [
1969+
new_block(ensure_block_shape(x[1], ndim=2), placement=x[0], ndim=2)
1970+
for x in tuples
1971+
]
19781972

19791973

19801974
def _stack_arrays(tuples, dtype: np.dtype):

pandas/tests/io/pytables/test_categorical.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -83,8 +83,9 @@ def test_categorical(setup_path):
8383
# Make sure the metadata is OK
8484
info = store.info()
8585
assert "/df2 " in info
86-
# assert '/df2/meta/values_block_0/meta' in info
87-
assert "/df2/meta/values_block_1/meta" in info
86+
# df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical
87+
assert "/df2/meta/values_block_0/meta" in info
88+
assert "/df2/meta/values_block_2/meta" in info
8889

8990
# unordered
9091
_maybe_remove(store, "s2")

0 commit comments

Comments
 (0)