From 64e969294c7fc6929983c8293f369a94dfed43a0 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 9 Jun 2020 18:51:10 -0700 Subject: [PATCH 1/4] CLN: don't consolidate in reshape.concat --- pandas/core/reshape/concat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index db7e9265ac21d..cb68d1db2e2f6 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -345,7 +345,7 @@ def __init__( if len(objs) == 0: raise ValueError("All objects passed were None") - # consolidate data & figure out what our result ndim is going to be + # figure out what our result ndim is going to be ndims = set() for obj in objs: if not isinstance(obj, (Series, DataFrame)): @@ -355,8 +355,6 @@ def __init__( ) raise TypeError(msg) - # consolidate - obj._consolidate(inplace=True) ndims.add(obj.ndim) # get the sample From 5f6d0624d13c7efc7f6f8761f7a0d29af4086611 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 13 Sep 2020 18:43:07 -0700 Subject: [PATCH 2/4] PERF: optimizations for concat --- pandas/core/dtypes/common.py | 8 ++++++- pandas/core/internals/blocks.py | 42 ++++++++++++++++++++------------- pandas/core/reshape/concat.py | 4 +++- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5987fdabf78bb..3b1ebc04c3ec7 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import Interval, Period, algos +from pandas._libs import Interval, Period, Timestamp, algos from pandas._libs.tslibs import conversion from pandas._typing import ArrayLike, DtypeObj, Optional @@ -1215,6 +1215,12 @@ def needs_i8_conversion(arr_or_dtype) -> bool: """ if arr_or_dtype is None: return False + if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): + # Fastpath + dtype = arr_or_dtype + return ( + dtype.type is Period or dtype.type is Timestamp or dtype.kind in 
["m", "M"] + ) return ( is_datetime_or_timedelta_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index eb5b887c8b0cb..5bfebc91358a0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -6,7 +6,15 @@ import numpy as np -from pandas._libs import NaT, algos as libalgos, lib, writers +from pandas._libs import ( + Interval, + NaT, + Period, + Timestamp, + algos as libalgos, + lib, + writers, +) import pandas._libs.internals as libinternals from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion @@ -39,10 +47,8 @@ is_float_dtype, is_integer, is_integer_dtype, - is_interval_dtype, is_list_like, is_object_dtype, - is_period_dtype, is_re, is_re_compilable, is_sparse, @@ -2733,33 +2739,35 @@ def get_block_type(values, dtype=None): ------- cls : class, subclass of Block """ + # We use vtype and kind checks because they are much more performant + # than is_foo_dtype dtype = dtype or values.dtype vtype = dtype.type + kind = dtype.kind if is_sparse(dtype): # Need this first(ish) so that Sparse[datetime] is sparse cls = ExtensionBlock - elif is_categorical_dtype(values.dtype): + elif dtype.name == "category": cls = CategoricalBlock - elif issubclass(vtype, np.datetime64): - assert not is_datetime64tz_dtype(values.dtype) - cls = DatetimeBlock - elif is_datetime64tz_dtype(values.dtype): + elif vtype is Timestamp: cls = DatetimeTZBlock - elif is_interval_dtype(dtype) or is_period_dtype(dtype): + elif vtype is Interval or vtype is Period: cls = ObjectValuesExtensionBlock - elif is_extension_array_dtype(values.dtype): + elif isinstance(dtype, ExtensionDtype): cls = ExtensionBlock - elif issubclass(vtype, np.floating): - cls = FloatBlock - elif issubclass(vtype, np.timedelta64): - assert issubclass(vtype, np.integer) + + elif kind == "M": + cls = DatetimeBlock + elif kind == "m": cls = TimeDeltaBlock - elif issubclass(vtype, 
np.complexfloating): + elif kind == "f": + cls = FloatBlock + elif kind == "c": cls = ComplexBlock - elif issubclass(vtype, np.integer): + elif kind == "i" or kind == "u": cls = IntBlock - elif dtype == np.bool_: + elif kind == "b": cls = BoolBlock else: cls = ObjectBlock diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 7a65f97f67d24..e221c5e38059a 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -8,6 +8,7 @@ import numpy as np from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Label +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -515,7 +516,7 @@ def _get_result_dim(self) -> int: def _get_new_axes(self) -> List[Index]: ndim = self._get_result_dim() return [ - self._get_concat_axis() if i == self.bm_axis else self._get_comb_axis(i) + self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i) for i in range(ndim) ] @@ -529,6 +530,7 @@ def _get_comb_axis(self, i: int) -> Index: copy=self.copy, ) + @cache_readonly def _get_concat_axis(self) -> Index: """ Return index to be used along concatenation axis. 
From 1c69c93a825a19ae070235d13f72ee5c8177688f Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 19 Sep 2020 13:29:28 -0700 Subject: [PATCH 3/4] PERF: fastpaths --- pandas/core/internals/concat.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index f5d0c921e1006..344219bae1f3f 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -67,14 +67,21 @@ def concatenate_block_managers( vals = [ju.block.values for ju in join_units] if not blk.is_extension: - values = concat_compat(vals, axis=blk.ndim - 1) + # _is_uniform_join_units ensures a single dtype, so + # we can use np.concatenate, which is more performant + # than concat_compat + values = np.concatenate(vals, axis=blk.ndim - 1) else: # TODO(EA2D): special-casing not needed with 2D EAs values = concat_compat(vals) if not isinstance(values, ExtensionArray): values = values.reshape(1, len(values)) - b = make_block(values, placement=placement, ndim=blk.ndim) + if blk.values.dtype == values.dtype: + # Fast-path + b = blk.make_block_same_class(values, placement=placement) + else: + b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( _concatenate_join_units(join_units, concat_axis, copy=copy), From 55637ad2e65decdb6a3f00a21c5fd136bf4c277f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 17 Dec 2020 10:17:11 -0800 Subject: [PATCH 4/4] use isinstance check --- pandas/core/internals/blocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3e4e3d845d06d..59301391a7dad 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -58,7 +58,7 @@ is_timedelta64_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, 
ABCIndex, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat @@ -2647,7 +2647,7 @@ def get_block_type(values, dtype=None): if is_sparse(dtype): # Need this first(ish) so that Sparse[datetime] is sparse cls = ExtensionBlock - elif dtype.name == "category": + elif isinstance(dtype, CategoricalDtype): cls = CategoricalBlock elif vtype is Timestamp: cls = DatetimeTZBlock