Skip to content

Commit ab3d70c

Browse files
jbrockmendel authored and CGe0516 committed
PERF: DataFrame constructor (pandas-dev#42702)
1 parent 8ceb31b commit ab3d70c

File tree

4 files changed

+44
-23
lines changed

4 files changed

+44
-23
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -166,6 +166,7 @@ Performance improvements
166166
~~~~~~~~~~~~~~~~~~~~~~~~
167167
- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
168168
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
169+
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
169170

170171
.. ---------------------------------------------------------------------------
171172

pandas/core/internals/blocks.py

+9 -6
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,6 @@
2525
from pandas._libs.internals import BlockPlacement
2626
from pandas._typing import (
2727
ArrayLike,
28-
Dtype,
2928
DtypeObj,
3029
F,
3130
Shape,
@@ -52,7 +51,6 @@
5251
is_list_like,
5352
is_sparse,
5453
is_string_dtype,
55-
pandas_dtype,
5654
)
5755
from pandas.core.dtypes.dtypes import (
5856
CategoricalDtype,
@@ -100,6 +98,7 @@
10098
TimedeltaArray,
10199
)
102100
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
101+
from pandas.core.arrays.sparse import SparseDtype
103102
from pandas.core.base import PandasObject
104103
import pandas.core.common as com
105104
import pandas.core.computation.expressions as expressions
@@ -326,6 +325,8 @@ def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block:
326325

327326
return type(self)(new_values, new_mgr_locs, self.ndim)
328327

328+
# NB: this cannot be made cache_readonly because in libreduction we pin
329+
# new .values that can have different shape GH#42631
329330
@property
330331
def shape(self) -> Shape:
331332
return self.values.shape
@@ -1843,7 +1844,7 @@ class CategoricalBlock(ExtensionBlock):
18431844
# Constructor Helpers
18441845

18451846

1846-
def maybe_coerce_values(values) -> ArrayLike:
1847+
def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
18471848
"""
18481849
Input validation for values passed to __init__. Ensure that
18491850
any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
@@ -1875,7 +1876,7 @@ def maybe_coerce_values(values) -> ArrayLike:
18751876
return values
18761877

18771878

1878-
def get_block_type(values, dtype: Dtype | None = None):
1879+
def get_block_type(values, dtype: DtypeObj | None = None):
18791880
"""
18801881
Find the appropriate Block subclass to use for the given values and dtype.
18811882
@@ -1890,13 +1891,15 @@ def get_block_type(values, dtype: Dtype | None = None):
18901891
"""
18911892
# We use vtype and kind checks because they are much more performant
18921893
# than is_foo_dtype
1893-
dtype = cast(np.dtype, pandas_dtype(dtype) if dtype else values.dtype)
1894+
if dtype is None:
1895+
dtype = values.dtype
1896+
18941897
vtype = dtype.type
18951898
kind = dtype.kind
18961899

18971900
cls: type[Block]
18981901

1899-
if is_sparse(dtype):
1902+
if isinstance(dtype, SparseDtype):
19001903
# Need this first(ish) so that Sparse[datetime] is sparse
19011904
cls = ExtensionBlock
19021905
elif isinstance(dtype, CategoricalDtype):

pandas/core/internals/managers.py

+21 -17
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ class BaseBlockManager(DataManager):
148148
_known_consolidated: bool
149149
_is_consolidated: bool
150150

151-
def __init__(self, blocks, axes, verify_integrity=True):
151+
def __init__(self, blocks, axes, verify_integrity: bool = True):
152152
raise NotImplementedError
153153

154154
@classmethod
@@ -889,7 +889,8 @@ def __init__(
889889
):
890890

891891
if verify_integrity:
892-
assert all(isinstance(x, Index) for x in axes)
892+
# Assertion disabled for performance
893+
# assert all(isinstance(x, Index) for x in axes)
893894

894895
for block in blocks:
895896
if self.ndim != block.ndim:
@@ -1563,8 +1564,9 @@ def __init__(
15631564
verify_integrity: bool = False,
15641565
fastpath=lib.no_default,
15651566
):
1566-
assert isinstance(block, Block), type(block)
1567-
assert isinstance(axis, Index), type(axis)
1567+
# Assertions disabled for performance
1568+
# assert isinstance(block, Block), type(block)
1569+
# assert isinstance(axis, Index), type(axis)
15681570

15691571
if fastpath is not lib.no_default:
15701572
warnings.warn(
@@ -1665,7 +1667,8 @@ def getitem_mgr(self, indexer) -> SingleBlockManager:
16651667
return type(self)(block, new_idx)
16661668

16671669
def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
1668-
assert isinstance(slobj, slice), type(slobj)
1670+
# Assertion disabled for performance
1671+
# assert isinstance(slobj, slice), type(slobj)
16691672
if axis >= self.ndim:
16701673
raise IndexError("Requested axis not found in manager")
16711674

@@ -1783,9 +1786,10 @@ def create_block_manager_from_arrays(
17831786
axes: list[Index],
17841787
consolidate: bool = True,
17851788
) -> BlockManager:
1786-
assert isinstance(names, Index)
1787-
assert isinstance(axes, list)
1788-
assert all(isinstance(x, Index) for x in axes)
1789+
# Assertions disabled for performance
1790+
# assert isinstance(names, Index)
1791+
# assert isinstance(axes, list)
1792+
# assert all(isinstance(x, Index) for x in axes)
17891793

17901794
arrays = [_extract_array(x) for x in arrays]
17911795

@@ -1840,7 +1844,8 @@ def _form_blocks(
18401844
if names_idx.equals(axes[0]):
18411845
names_indexer = np.arange(len(names_idx))
18421846
else:
1843-
assert names_idx.intersection(axes[0]).is_unique
1847+
# Assertion disabled for performance
1848+
# assert names_idx.intersection(axes[0]).is_unique
18441849
names_indexer = names_idx.get_indexer_for(axes[0])
18451850

18461851
for i, name_idx in enumerate(names_indexer):
@@ -1868,10 +1873,9 @@ def _form_blocks(
18681873

18691874
if len(items_dict["DatetimeTZBlock"]):
18701875
dttz_blocks = [
1871-
new_block(
1876+
DatetimeTZBlock(
18721877
ensure_block_shape(extract_array(array), 2),
1873-
klass=DatetimeTZBlock,
1874-
placement=i,
1878+
placement=BlockPlacement(i),
18751879
ndim=2,
18761880
)
18771881
for i, array in items_dict["DatetimeTZBlock"]
@@ -1886,14 +1890,14 @@ def _form_blocks(
18861890

18871891
if len(items_dict["CategoricalBlock"]) > 0:
18881892
cat_blocks = [
1889-
new_block(array, klass=CategoricalBlock, placement=i, ndim=2)
1893+
CategoricalBlock(array, placement=BlockPlacement(i), ndim=2)
18901894
for i, array in items_dict["CategoricalBlock"]
18911895
]
18921896
blocks.extend(cat_blocks)
18931897

18941898
if len(items_dict["ExtensionBlock"]):
18951899
external_blocks = [
1896-
new_block(array, klass=ExtensionBlock, placement=i, ndim=2)
1900+
ExtensionBlock(array, placement=BlockPlacement(i), ndim=2)
18971901
for i, array in items_dict["ExtensionBlock"]
18981902
]
18991903

@@ -1926,7 +1930,7 @@ def _simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
19261930
if dtype is not None and values.dtype != dtype: # pragma: no cover
19271931
values = values.astype(dtype)
19281932

1929-
block = new_block(values, placement=placement, ndim=2)
1933+
block = new_block(values, placement=BlockPlacement(placement), ndim=2)
19301934
return [block]
19311935

19321936

@@ -1949,14 +1953,14 @@ def _multi_blockify(tuples, dtype: DtypeObj | None = None, consolidate: bool = T
19491953
list(tup_block), dtype # type: ignore[arg-type]
19501954
)
19511955

1952-
block = new_block(values, placement=placement, ndim=2)
1956+
block = new_block(values, placement=BlockPlacement(placement), ndim=2)
19531957
new_blocks.append(block)
19541958

19551959
return new_blocks
19561960

19571961

19581962
def _tuples_to_blocks_no_consolidate(tuples, dtype: DtypeObj | None) -> list[Block]:
1959-
# tuples produced within _form_blocks are of the form (placement, whatever, array)
1963+
# tuples produced within _form_blocks are of the form (placement, array)
19601964
if dtype is not None:
19611965
return [
19621966
new_block(

pandas/tests/groupby/test_apply.py

+13
Original file line number | Diff line number | Diff line change
@@ -1178,3 +1178,16 @@ def test_positional_slice_groups_datetimelike():
11781178
lambda x: x.iloc[0:]
11791179
)
11801180
tm.assert_frame_equal(result, expected)
1181+
1182+
1183+
def test_doctest_example2():
1184+
# GH#42702 this fails if we cache_readonly Block.shape
1185+
# TODO: more informative name
1186+
df = DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3], "C": [4, 6, 5]})
1187+
gb = df.groupby("A")
1188+
result = gb[["B", "C"]].apply(lambda x: x.astype(float).max() - x.min())
1189+
1190+
expected = DataFrame(
1191+
{"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A")
1192+
)
1193+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)