Skip to content

Commit 37d9306

Browse files
Revert "PERF: DataFrame construction (pandas-dev#42631)" (pandas-dev#42698)
This reverts commit 9d6f8fe.
1 parent: baf9e4b · commit: 37d9306

File tree

3 files changed: +24 -29 lines changed

3 files changed: +24 -29 lines changed

doc/source/whatsnew/v1.4.0.rst

-1
Original file line number | Diff line number | Diff line change
@@ -165,7 +165,6 @@ Performance improvements
165165
~~~~~~~~~~~~~~~~~~~~~~~~
166166
- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
167167
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
168-
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
169168

170169
.. ---------------------------------------------------------------------------
171170

pandas/core/internals/blocks.py

+7 -7
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
from pandas._libs.internals import BlockPlacement
2626
from pandas._typing import (
2727
ArrayLike,
28+
Dtype,
2829
DtypeObj,
2930
F,
3031
Shape,
@@ -51,6 +52,7 @@
5152
is_list_like,
5253
is_sparse,
5354
is_string_dtype,
55+
pandas_dtype,
5456
)
5557
from pandas.core.dtypes.dtypes import (
5658
CategoricalDtype,
@@ -98,7 +100,6 @@
98100
TimedeltaArray,
99101
)
100102
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
101-
from pandas.core.arrays.sparse import SparseDtype
102103
from pandas.core.base import PandasObject
103104
import pandas.core.common as com
104105
import pandas.core.computation.expressions as expressions
@@ -325,7 +326,7 @@ def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
325326

326327
return type(self)(new_values, new_mgr_locs, self.ndim)
327328

328-
@cache_readonly
329+
@property
329330
def shape(self) -> Shape:
330331
return self.values.shape
331332

@@ -1841,7 +1842,7 @@ class CategoricalBlock(ExtensionBlock):
18411842
# Constructor Helpers
18421843

18431844

1844-
def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
1845+
def maybe_coerce_values(values) -> ArrayLike:
18451846
"""
18461847
Input validation for values passed to __init__. Ensure that
18471848
any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
@@ -1873,7 +1874,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
18731874
return values
18741875

18751876

1876-
def get_block_type(values: ArrayLike, dtype: DtypeObj | None = None):
1877+
def get_block_type(values, dtype: Dtype | None = None):
18771878
"""
18781879
Find the appropriate Block subclass to use for the given values and dtype.
18791880
@@ -1888,14 +1889,13 @@ def get_block_type(values: ArrayLike, dtype: DtypeObj | None = None):
18881889
"""
18891890
# We use vtype and kind checks because they are much more performant
18901891
# than is_foo_dtype
1891-
if dtype is None:
1892-
dtype = values.dtype
1892+
dtype = cast(np.dtype, pandas_dtype(dtype) if dtype else values.dtype)
18931893
vtype = dtype.type
18941894
kind = dtype.kind
18951895

18961896
cls: type[Block]
18971897

1898-
if isinstance(dtype, SparseDtype):
1898+
if is_sparse(dtype):
18991899
# Need this first(ish) so that Sparse[datetime] is sparse
19001900
cls = ExtensionBlock
19011901
elif isinstance(dtype, CategoricalDtype):

pandas/core/internals/managers.py

+17 -21
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ class BaseBlockManager(DataManager):
148148
_known_consolidated: bool
149149
_is_consolidated: bool
150150

151-
def __init__(self, blocks, axes, verify_integrity: bool = True):
151+
def __init__(self, blocks, axes, verify_integrity=True):
152152
raise NotImplementedError
153153

154154
@classmethod
@@ -889,8 +889,7 @@ def __init__(
889889
):
890890

891891
if verify_integrity:
892-
# Assertion disabled for performance
893-
# assert all(isinstance(x, Index) for x in axes)
892+
assert all(isinstance(x, Index) for x in axes)
894893

895894
for block in blocks:
896895
if self.ndim != block.ndim:
@@ -1559,9 +1558,8 @@ def __init__(
15591558
verify_integrity: bool = False,
15601559
fastpath=lib.no_default,
15611560
):
1562-
# Assertions disabled for performance
1563-
# assert isinstance(block, Block), type(block)
1564-
# assert isinstance(axis, Index), type(axis)
1561+
assert isinstance(block, Block), type(block)
1562+
assert isinstance(axis, Index), type(axis)
15651563

15661564
if fastpath is not lib.no_default:
15671565
warnings.warn(
@@ -1662,8 +1660,7 @@ def getitem_mgr(self, indexer) -> SingleBlockManager:
16621660
return type(self)(block, new_idx)
16631661

16641662
def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
1665-
# Assertion disabled for performance
1666-
# assert isinstance(slobj, slice), type(slobj)
1663+
assert isinstance(slobj, slice), type(slobj)
16671664
if axis >= self.ndim:
16681665
raise IndexError("Requested axis not found in manager")
16691666

@@ -1781,10 +1778,9 @@ def create_block_manager_from_arrays(
17811778
axes: list[Index],
17821779
consolidate: bool = True,
17831780
) -> BlockManager:
1784-
# Assertions disabled for performance
1785-
# assert isinstance(names, Index)
1786-
# assert isinstance(axes, list)
1787-
# assert all(isinstance(x, Index) for x in axes)
1781+
assert isinstance(names, Index)
1782+
assert isinstance(axes, list)
1783+
assert all(isinstance(x, Index) for x in axes)
17881784

17891785
arrays = [_extract_array(x) for x in arrays]
17901786

@@ -1839,8 +1835,7 @@ def _form_blocks(
18391835
if names_idx.equals(axes[0]):
18401836
names_indexer = np.arange(len(names_idx))
18411837
else:
1842-
# Assertion disabled for performance
1843-
# assert names_idx.intersection(axes[0]).is_unique
1838+
assert names_idx.intersection(axes[0]).is_unique
18441839
names_indexer = names_idx.get_indexer_for(axes[0])
18451840

18461841
for i, name_idx in enumerate(names_indexer):
@@ -1868,9 +1863,10 @@ def _form_blocks(
18681863

18691864
if len(items_dict["DatetimeTZBlock"]):
18701865
dttz_blocks = [
1871-
DatetimeTZBlock(
1866+
new_block(
18721867
ensure_block_shape(extract_array(array), 2),
1873-
placement=BlockPlacement(i),
1868+
klass=DatetimeTZBlock,
1869+
placement=i,
18741870
ndim=2,
18751871
)
18761872
for i, array in items_dict["DatetimeTZBlock"]
@@ -1885,14 +1881,14 @@ def _form_blocks(
18851881

18861882
if len(items_dict["CategoricalBlock"]) > 0:
18871883
cat_blocks = [
1888-
CategoricalBlock(array, placement=BlockPlacement(i), ndim=2)
1884+
new_block(array, klass=CategoricalBlock, placement=i, ndim=2)
18891885
for i, array in items_dict["CategoricalBlock"]
18901886
]
18911887
blocks.extend(cat_blocks)
18921888

18931889
if len(items_dict["ExtensionBlock"]):
18941890
external_blocks = [
1895-
ExtensionBlock(array, placement=BlockPlacement(i), ndim=2)
1891+
new_block(array, klass=ExtensionBlock, placement=i, ndim=2)
18961892
for i, array in items_dict["ExtensionBlock"]
18971893
]
18981894

@@ -1925,7 +1921,7 @@ def _simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
19251921
if dtype is not None and values.dtype != dtype: # pragma: no cover
19261922
values = values.astype(dtype)
19271923

1928-
block = new_block(values, placement=BlockPlacement(placement), ndim=2)
1924+
block = new_block(values, placement=placement, ndim=2)
19291925
return [block]
19301926

19311927

@@ -1948,14 +1944,14 @@ def _multi_blockify(tuples, dtype: DtypeObj | None = None, consolidate: bool = T
19481944
list(tup_block), dtype # type: ignore[arg-type]
19491945
)
19501946

1951-
block = new_block(values, placement=BlockPlacement(placement), ndim=2)
1947+
block = new_block(values, placement=placement, ndim=2)
19521948
new_blocks.append(block)
19531949

19541950
return new_blocks
19551951

19561952

19571953
def _tuples_to_blocks_no_consolidate(tuples, dtype: DtypeObj | None) -> list[Block]:
1958-
# tuples produced within _form_blocks are of the form (placement, array)
1954+
# tuples produced within _form_blocks are of the form (placement, whatever, array)
19591955
if dtype is not None:
19601956
return [
19611957
new_block(

0 commit comments

Comments
 (0)