Skip to content

Commit 9d6f8fe

Browse files
authored
PERF: DataFrame construction (pandas-dev#42631)
1 parent 3759e7d commit 9d6f8fe

File tree

3 files changed

+29
-24
lines changed

3 files changed

+29
-24
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,7 @@ Performance improvements
164164
~~~~~~~~~~~~~~~~~~~~~~~~
165165
- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
166166
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
167+
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
167168

168169
.. ---------------------------------------------------------------------------
169170

pandas/core/internals/blocks.py

+7 -7
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,6 @@
2525
from pandas._libs.internals import BlockPlacement
2626
from pandas._typing import (
2727
ArrayLike,
28-
Dtype,
2928
DtypeObj,
3029
F,
3130
Shape,
@@ -52,7 +51,6 @@
5251
is_list_like,
5352
is_sparse,
5453
is_string_dtype,
55-
pandas_dtype,
5654
)
5755
from pandas.core.dtypes.dtypes import (
5856
CategoricalDtype,
@@ -100,6 +98,7 @@
10098
TimedeltaArray,
10199
)
102100
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
101+
from pandas.core.arrays.sparse import SparseDtype
103102
from pandas.core.base import PandasObject
104103
import pandas.core.common as com
105104
import pandas.core.computation.expressions as expressions
@@ -326,7 +325,7 @@ def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
326325

327326
return type(self)(new_values, new_mgr_locs, self.ndim)
328327

329-
@property
328+
@cache_readonly
330329
def shape(self) -> Shape:
331330
return self.values.shape
332331

@@ -1842,7 +1841,7 @@ class CategoricalBlock(ExtensionBlock):
18421841
# Constructor Helpers
18431842

18441843

1845-
def maybe_coerce_values(values) -> ArrayLike:
1844+
def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
18461845
"""
18471846
Input validation for values passed to __init__. Ensure that
18481847
any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
@@ -1874,7 +1873,7 @@ def maybe_coerce_values(values) -> ArrayLike:
18741873
return values
18751874

18761875

1877-
def get_block_type(values, dtype: Dtype | None = None):
1876+
def get_block_type(values: ArrayLike, dtype: DtypeObj | None = None):
18781877
"""
18791878
Find the appropriate Block subclass to use for the given values and dtype.
18801879
@@ -1889,13 +1888,14 @@ def get_block_type(values, dtype: Dtype | None = None):
18891888
"""
18901889
# We use vtype and kind checks because they are much more performant
18911890
# than is_foo_dtype
1892-
dtype = cast(np.dtype, pandas_dtype(dtype) if dtype else values.dtype)
1891+
if dtype is None:
1892+
dtype = values.dtype
18931893
vtype = dtype.type
18941894
kind = dtype.kind
18951895

18961896
cls: type[Block]
18971897

1898-
if is_sparse(dtype):
1898+
if isinstance(dtype, SparseDtype):
18991899
# Need this first(ish) so that Sparse[datetime] is sparse
19001900
cls = ExtensionBlock
19011901
elif isinstance(dtype, CategoricalDtype):

pandas/core/internals/managers.py

+21 -17
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ class BaseBlockManager(DataManager):
148148
_known_consolidated: bool
149149
_is_consolidated: bool
150150

151-
def __init__(self, blocks, axes, verify_integrity=True):
151+
def __init__(self, blocks, axes, verify_integrity: bool = True):
152152
raise NotImplementedError
153153

154154
@classmethod
@@ -889,7 +889,8 @@ def __init__(
889889
):
890890

891891
if verify_integrity:
892-
assert all(isinstance(x, Index) for x in axes)
892+
# Assertion disabled for performance
893+
# assert all(isinstance(x, Index) for x in axes)
893894

894895
for block in blocks:
895896
if self.ndim != block.ndim:
@@ -1558,8 +1559,9 @@ def __init__(
15581559
verify_integrity: bool = False,
15591560
fastpath=lib.no_default,
15601561
):
1561-
assert isinstance(block, Block), type(block)
1562-
assert isinstance(axis, Index), type(axis)
1562+
# Assertions disabled for performance
1563+
# assert isinstance(block, Block), type(block)
1564+
# assert isinstance(axis, Index), type(axis)
15631565

15641566
if fastpath is not lib.no_default:
15651567
warnings.warn(
@@ -1660,7 +1662,8 @@ def getitem_mgr(self, indexer) -> SingleBlockManager:
16601662
return type(self)(block, new_idx)
16611663

16621664
def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
1663-
assert isinstance(slobj, slice), type(slobj)
1665+
# Assertion disabled for performance
1666+
# assert isinstance(slobj, slice), type(slobj)
16641667
if axis >= self.ndim:
16651668
raise IndexError("Requested axis not found in manager")
16661669

@@ -1778,9 +1781,10 @@ def create_block_manager_from_arrays(
17781781
axes: list[Index],
17791782
consolidate: bool = True,
17801783
) -> BlockManager:
1781-
assert isinstance(names, Index)
1782-
assert isinstance(axes, list)
1783-
assert all(isinstance(x, Index) for x in axes)
1784+
# Assertions disabled for performance
1785+
# assert isinstance(names, Index)
1786+
# assert isinstance(axes, list)
1787+
# assert all(isinstance(x, Index) for x in axes)
17841788

17851789
arrays = [_extract_array(x) for x in arrays]
17861790

@@ -1835,7 +1839,8 @@ def _form_blocks(
18351839
if names_idx.equals(axes[0]):
18361840
names_indexer = np.arange(len(names_idx))
18371841
else:
1838-
assert names_idx.intersection(axes[0]).is_unique
1842+
# Assertion disabled for performance
1843+
# assert names_idx.intersection(axes[0]).is_unique
18391844
names_indexer = names_idx.get_indexer_for(axes[0])
18401845

18411846
for i, name_idx in enumerate(names_indexer):
@@ -1863,10 +1868,9 @@ def _form_blocks(
18631868

18641869
if len(items_dict["DatetimeTZBlock"]):
18651870
dttz_blocks = [
1866-
new_block(
1871+
DatetimeTZBlock(
18671872
ensure_block_shape(extract_array(array), 2),
1868-
klass=DatetimeTZBlock,
1869-
placement=i,
1873+
placement=BlockPlacement(i),
18701874
ndim=2,
18711875
)
18721876
for i, array in items_dict["DatetimeTZBlock"]
@@ -1881,14 +1885,14 @@ def _form_blocks(
18811885

18821886
if len(items_dict["CategoricalBlock"]) > 0:
18831887
cat_blocks = [
1884-
new_block(array, klass=CategoricalBlock, placement=i, ndim=2)
1888+
CategoricalBlock(array, placement=BlockPlacement(i), ndim=2)
18851889
for i, array in items_dict["CategoricalBlock"]
18861890
]
18871891
blocks.extend(cat_blocks)
18881892

18891893
if len(items_dict["ExtensionBlock"]):
18901894
external_blocks = [
1891-
new_block(array, klass=ExtensionBlock, placement=i, ndim=2)
1895+
ExtensionBlock(array, placement=BlockPlacement(i), ndim=2)
18921896
for i, array in items_dict["ExtensionBlock"]
18931897
]
18941898

@@ -1921,7 +1925,7 @@ def _simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
19211925
if dtype is not None and values.dtype != dtype: # pragma: no cover
19221926
values = values.astype(dtype)
19231927

1924-
block = new_block(values, placement=placement, ndim=2)
1928+
block = new_block(values, placement=BlockPlacement(placement), ndim=2)
19251929
return [block]
19261930

19271931

@@ -1944,14 +1948,14 @@ def _multi_blockify(tuples, dtype: DtypeObj | None = None, consolidate: bool = T
19441948
list(tup_block), dtype # type: ignore[arg-type]
19451949
)
19461950

1947-
block = new_block(values, placement=placement, ndim=2)
1951+
block = new_block(values, placement=BlockPlacement(placement), ndim=2)
19481952
new_blocks.append(block)
19491953

19501954
return new_blocks
19511955

19521956

19531957
def _tuples_to_blocks_no_consolidate(tuples, dtype: DtypeObj | None) -> list[Block]:
1954-
# tuples produced within _form_blocks are of the form (placement, whatever, array)
1958+
# tuples produced within _form_blocks are of the form (placement, array)
19551959
if dtype is not None:
19561960
return [
19571961
new_block(

0 commit comments

Comments
 (0)