From 007efb19d051bad97a9bae4adf642914ec0f0d19 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 30 Sep 2017 17:01:50 +0200 Subject: [PATCH 01/11] REF series concat: concat blocks if all of same type --- pandas/core/internals.py | 23 +++++++++++++++++++++++ pandas/core/reshape/concat.py | 11 ++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9e348819ce5a3..25fcd6fe82c76 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -312,6 +312,14 @@ def ftype(self): def merge(self, other): return _merge_blocks([self, other]) + def concat_same_type(self, others): + """ + Concatenate list of single blocks of the same type. + """ + values = np.concatenate([self.values] + [o.values for o in others]) + return self.make_block_same_class( + values, placement=slice(0, len(values), 1)) + def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None, mask_info=None): """ @@ -2684,6 +2692,21 @@ def shift(self, periods, axis=0, mgr=None): return [self.make_block_same_class(new_values, placement=self.mgr_locs)] + def concat_same_type(self, others): + """ + Concatenate list of single blocks of the same type. + """ + # can maybe replace + # from pandas.core.dtypes.concat._concat_datetimetz ? + to_concat = [self.values] + [o.values for o in others] + + if len(set([str(x.dtype) for x in to_concat])) != 1: + raise ValueError('to_concat must have the same tz') + + values = to_concat[0]._concat_same_dtype(to_concat, None) + return self.make_block_same_class( + values, placement=slice(0, len(values), 1)) + class SparseBlock(NonConsolidatableMixIn, Block): """ implement as a list of sparse arrays of the same dtype """ diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 4040c65136617..963c531b9b366 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -362,6 +362,16 @@ def get_result(self): # stack blocks if self.axis == 0: + name = com._consensus_name_attr(self.objs) + + # check if all series are of the same block type: + blocks = [obj._data.blocks[0] for obj in self.objs] + if all([type(b) == type(blocks[0]) for b in blocks[1:]]): + new_block = blocks[0].concat_same_type(blocks[1:]) + return (Series(new_block, index=self.new_axes[0], + name=name, fastpath=True) + .__finalize__(self, method='concat')) + # concat Series with length to keep dtype as much non_empties = [x for x in self.objs if len(x) > 0] if len(non_empties) > 0: @@ -370,7 +380,6 @@ def get_result(self): values = [x._values for x in self.objs] new_data = _concat._concat_compat(values) - name = com._consensus_name_attr(self.objs) cons = _concat._get_series_result_type(new_data) return (cons(new_data, index=self.new_axes[0], From dd9babd8cff92fba46852d21a800482123c048fe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 1 Oct 2017 12:16:29 +0200 Subject: [PATCH 02/11] fix categorical and sparse --- pandas/core/internals.py | 47 ++++++++++++++++++++++++++--------- pandas/core/reshape/concat.py | 26 +++++++++++-------- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 25fcd6fe82c76..ff343733fc6ba 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -312,11 +312,11 @@ def ftype(self): def merge(self, other): return _merge_blocks([self, other]) - def concat_same_type(self, others): + def concat_same_type(self, to_concat): """ Concatenate list of single blocks of the same type. """ - values = np.concatenate([self.values] + [o.values for o in others]) + values = np.concatenate([blk.values for blk in to_concat]) return self.make_block_same_class( values, placement=slice(0, len(values), 1)) @@ -2415,6 +2415,19 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) + def concat_same_type(self, to_concat): + """ + Concatenate list of single blocks of the same type. + """ + to_concat = [blk.values for blk in to_concat] + values = _concat._concat_categorical(to_concat) + + if is_categorical_dtype(values.dtype): + return self.make_block_same_class( + values, placement=slice(0, len(values), 1)) + else: + return make_block(values, placement=slice(0, len(values), 1)) + class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () @@ -2692,20 +2705,18 @@ def shift(self, periods, axis=0, mgr=None): return [self.make_block_same_class(new_values, placement=self.mgr_locs)] - def concat_same_type(self, others): + def concat_same_type(self, to_concat): """ Concatenate list of single blocks of the same type. """ - # can maybe replace - # from pandas.core.dtypes.concat._concat_datetimetz ? - to_concat = [self.values] + [o.values for o in others] - - if len(set([str(x.dtype) for x in to_concat])) != 1: - raise ValueError('to_concat must have the same tz') + to_concat = [blk.values for blk in to_concat] + values = _concat._concat_datetime(to_concat) - values = to_concat[0]._concat_same_dtype(to_concat, None) - return self.make_block_same_class( - values, placement=slice(0, len(values), 1)) + if is_datetimetz(values): + return self.make_block_same_class( + values, placement=slice(0, len(values), 1)) + else: + return make_block(values, placement=slice(0, len(values), 1)) class SparseBlock(NonConsolidatableMixIn, Block): @@ -2874,6 +2885,18 @@ def sparse_reindex(self, new_index): return self.make_block_same_class(values, sparse_index=new_index, placement=self.mgr_locs) + def concat_same_type(self, to_concat): + """ + Concatenate list of single blocks of the same type. + """ + to_concat = [blk.values for blk in to_concat] + values = _concat._concat_sparse(to_concat) + + return self.make_block_same_class( + values, placement=slice(0, len(values), 1)) + #else: + # return make_block(values, placement=slice(0, len(values), 1)) + def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=False): diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 963c531b9b366..8b4248f9fa60f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -3,13 +3,13 @@ """ import numpy as np -from pandas import compat, DataFrame, Series, Index, MultiIndex +from pandas import compat, DataFrame, Series, Index, MultiIndex, SparseSeries from pandas.core.index import (_get_objs_combined_axis, _ensure_index, _get_consensus_names, _all_indexes_same) from pandas.core.categorical import (_factorize_from_iterable, _factorize_from_iterables) -from pandas.core.internals import concatenate_block_managers +from pandas.core.internals import concatenate_block_managers, SparseBlock from pandas.core import common as com from pandas.core.generic import NDFrame import pandas.core.dtypes.concat as _concat @@ -364,16 +364,22 @@ def get_result(self): if self.axis == 0: name = com._consensus_name_attr(self.objs) - # check if all series are of the same block type: - blocks = [obj._data.blocks[0] for obj in self.objs] - if all([type(b) == type(blocks[0]) for b in blocks[1:]]): - new_block = blocks[0].concat_same_type(blocks[1:]) - return (Series(new_block, index=self.new_axes[0], - name=name, fastpath=True) - .__finalize__(self, method='concat')) - # concat Series with length to keep dtype as much non_empties = [x for x in self.objs if len(x) > 0] + + # check if all series are of the same block type: + if len(non_empties) > 0: + blocks = [obj._data.blocks[0] for obj in non_empties] + if all([type(b) == type(blocks[0]) for b in blocks[1:]]): + new_block = blocks[0].concat_same_type(blocks) + if isinstance(new_block, SparseBlock): + cons = SparseSeries + else: + cons = Series + return (cons(new_block, index=self.new_axes[0], + name=name, fastpath=True) + .__finalize__(self, method='concat')) + if len(non_empties) > 0: values = [x._values for x in non_empties] else: From 8c6f4a7fcdb129765797bae626d6fed830bf5064 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 1 Oct 2017 12:24:07 +0200 Subject: [PATCH 03/11] add specific test for external block --- pandas/tests/internals/test_external_block.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/internals/test_external_block.py index cccde76c3e1d9..e2bbbbd39ac5c 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -27,3 +27,12 @@ def test_custom_repr(): blk_mgr = BlockManager([block], [['col'], range(3)]) df = pd.DataFrame(blk_mgr) assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2' + + +def test_concat_series(): + values = np.arange(3, dtype='int64') + block = CustomBlock(values, placement=slice(0, 3)) + s = pd.Series(block, pd.RangeIndex(3), fastpath=True) + + res = pd.concat([s, s]) + assert isinstance(res._data.blocks[0], CustomBlock) From d4ce3dff3274cffdfaeb7baa5e7dc2f0c038ceb8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 1 Oct 2017 13:04:42 +0200 Subject: [PATCH 04/11] fix pep8 --- pandas/core/internals.py | 2 -- pandas/core/reshape/concat.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ff343733fc6ba..115a4853640de 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2894,8 +2894,6 @@ def concat_same_type(self, to_concat): return self.make_block_same_class( values, placement=slice(0, len(values), 1)) - #else: - # return make_block(values, placement=slice(0, len(values), 1)) def make_block(values, placement, klass=None, ndim=None, dtype=None, diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 8b4248f9fa60f..dc979d6b28d9c 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -370,7 +370,7 @@ def get_result(self): # check if all series are of the same block type: if len(non_empties) > 0: blocks = [obj._data.blocks[0] for obj in non_empties] - if all([type(b) == type(blocks[0]) for b in blocks[1:]]): + if all([type(b) is type(blocks[0]) for b in blocks[1:]]): # noqa new_block = blocks[0].concat_same_type(blocks) if isinstance(new_block, SparseBlock): cons = SparseSeries From 7676f034d42fffaa2a83f24eeaa971f505710aac Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Oct 2017 00:18:50 +0200 Subject: [PATCH 05/11] move logic into SingleBlockManager.concat --- pandas/core/internals.py | 24 ++++++++++++++++++++++++ pandas/core/reshape/concat.py | 32 +++++++------------------------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 115a4853640de..4ff5c007ec875 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4534,6 +4534,30 @@ def fast_xs(self, loc): """ return self._block.values[loc] + def concat(self, to_concat, new_axis): + + non_empties = [x for x in to_concat if len(x) > 0] + + # check if all series are of the same block type: + if len(non_empties) > 0: + blocks = [obj.blocks[0] for obj in non_empties] + + if all([type(b) is type(blocks[0]) for b in blocks[1:]]): # noqa + new_block = blocks[0].concat_same_type(blocks) + else: + values = [x.values for x in blocks] + values = _concat._concat_compat(values) + new_block = make_block( + values, placement=slice(0, len(values), 1)) + else: + values = [x._block.values for x in to_concat] + values = _concat._concat_compat(values) + new_block = make_block( + values, placement=slice(0, len(values), 1)) + + mgr = SingleBlockManager(new_block, new_axis) + return mgr + def construction_error(tot_items, block_shape, axes, e=None): """ raise a helpful message about our construction """ diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index dc979d6b28d9c..11b65aa3d1a71 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -364,33 +364,15 @@ def get_result(self): if self.axis == 0: name = com._consensus_name_attr(self.objs) - # concat Series with length to keep dtype as much - non_empties = [x for x in self.objs if len(x) > 0] - - # check if all series are of the same block type: - if len(non_empties) > 0: - blocks = [obj._data.blocks[0] for obj in non_empties] - if all([type(b) is type(blocks[0]) for b in blocks[1:]]): # noqa - new_block = blocks[0].concat_same_type(blocks) - if isinstance(new_block, SparseBlock): - cons = SparseSeries - else: - cons = Series - return (cons(new_block, index=self.new_axes[0], - name=name, fastpath=True) - .__finalize__(self, method='concat')) - - if len(non_empties) > 0: - values = [x._values for x in non_empties] - else: - values = [x._values for x in self.objs] - new_data = _concat._concat_compat(values) + mgr = self.objs[0]._data.concat([x._data for x in self.objs], + self.new_axes) - cons = _concat._get_series_result_type(new_data) + if mgr._block.is_sparse: + cons = SparseSeries + else: + cons = self.objs[0].__class__ - return (cons(new_data, index=self.new_axes[0], - name=name, dtype=new_data.dtype) - .__finalize__(self, method='concat')) + return cons(mgr, name=name).__finalize__(self, method='concat') # combine as columns in a frame else: From 1c35aca86dfdcf950e09fd8b7e486ed448784af7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Oct 2017 11:04:12 +0200 Subject: [PATCH 06/11] use Block.concat_same_type in concatenate_block_managers (concatting DataFrames) --- pandas/core/internals.py | 58 ++++++++++++++----- pandas/tests/internals/test_external_block.py | 29 +++++++++- 2 files changed, 70 insertions(+), 17 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4ff5c007ec875..f209d2bad7a5b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -312,13 +312,14 @@ def ftype(self): def merge(self, other): return _merge_blocks([self, other]) - def concat_same_type(self, to_concat): + def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ - values = np.concatenate([blk.values for blk in to_concat]) + values = np.concatenate([blk.values for blk in to_concat], + axis=self.ndim - 1) return self.make_block_same_class( - values, placement=slice(0, len(values), 1)) + values, placement=placement or slice(0, len(values), 1)) def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None, mask_info=None): @@ -2415,7 +2416,7 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) - def concat_same_type(self, to_concat): + def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ @@ -2424,9 +2425,9 @@ def concat_same_type(self, to_concat): if is_categorical_dtype(values.dtype): return self.make_block_same_class( - values, placement=slice(0, len(values), 1)) + values, placement=placement or slice(0, len(values), 1)) else: - return make_block(values, placement=slice(0, len(values), 1)) + return make_block(values, placement=placement or slice(0, len(values), 1)) class DatetimeBlock(DatetimeLikeBlockMixin, Block): @@ -2705,7 +2706,7 @@ def shift(self, periods, axis=0, mgr=None): return [self.make_block_same_class(new_values, placement=self.mgr_locs)] - def concat_same_type(self, to_concat): + def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ @@ -2714,9 +2715,9 @@ def concat_same_type(self, to_concat): if is_datetimetz(values): return self.make_block_same_class( - values, placement=slice(0, len(values), 1)) + values, placement=placement or slice(0, len(values), 1)) else: - return make_block(values, placement=slice(0, len(values), 1)) + return make_block(values, placement=placement or slice(0, len(values), 1)) class SparseBlock(NonConsolidatableMixIn, Block): @@ -2885,7 +2886,7 @@ def sparse_reindex(self, new_index): return self.make_block_same_class(values, sparse_index=new_index, placement=self.mgr_locs) - def concat_same_type(self, to_concat): + def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ @@ -2893,7 +2894,7 @@ def concat_same_type(self, to_concat): values = _concat._concat_sparse(to_concat) return self.make_block_same_class( - values, placement=slice(0, len(values), 1)) + values, placement=placement or slice(0, len(values), 1)) def make_block(values, placement, klass=None, ndim=None, dtype=None, @@ -5146,13 +5147,42 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): [get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers], concat_axis) - blocks = [make_block( - concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement) for placement, join_units in concat_plan] + blocks = [] + + for placement, join_units in concat_plan: + + if is_uniform_join_units(join_units): + b = join_units[0].block.concat_same_type( + [ju.block for ju in join_units], placement=placement) + else: + b = make_block( + concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement) + blocks.append(b) return BlockManager(blocks, axes) +def is_uniform_join_units(join_units): + """ + Check if the join units consist of blocks of uniform type that can + be concatenated using Block.concat_same_type instead of the generic + concatenate_join_units (which uses `_concat._concat_compat`). + + """ + return ( + # all blocks need to have the same type + all([type(ju.block) is type(join_units[0].block) for ju in join_units]) # noqa + # no blocks that would get missing values (can lead to type upcasts) + and all([not ju.is_na for ju in join_units]) + # no blocks with indexers (as then the dimensions do not fit) + and all([not ju.indexers for ju in join_units]) + # disregard Panels + and all([ju.block.ndim <= 2 for ju in join_units]) + # only use this path when there is something to concatenate + and len(join_units) > 1) + + def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/internals/test_external_block.py index e2bbbbd39ac5c..4b10b8cdd72d0 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -4,14 +4,25 @@ import numpy as np import pandas as pd -from pandas.core.internals import Block, BlockManager, SingleBlockManager +from pandas.core.internals import Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn -class CustomBlock(Block): +class CustomBlock(NonConsolidatableMixIn, Block): + + _holder = np.ndarray def formatting_values(self): return np.array(["Val: {}".format(i) for i in self.values]) + def concat_same_type(self, to_concat, placement=None): + """ + Always concatenate disregarding self.ndim as the values are + always 1D in this custom Block + """ + values = np.concatenate([blk.values for blk in to_concat]) + return self.make_block_same_class( + values, placement=placement or slice(0, len(values), 1)) + def test_custom_repr(): values = np.arange(3, dtype='int64') @@ -23,7 +34,7 @@ def test_custom_repr(): assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64' # dataframe - block = CustomBlock(values.reshape(1, -1), placement=slice(0, 1)) + block = CustomBlock(values, placement=slice(0, 1)) blk_mgr = BlockManager([block], [['col'], range(3)]) df = pd.DataFrame(blk_mgr) assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2' @@ -36,3 +47,15 @@ def test_concat_series(): res = pd.concat([s, s]) assert isinstance(res._data.blocks[0], CustomBlock) + + +def test_concat_dataframe(): + df = pd.DataFrame({'a': [1, 2, 3]}) + blocks = df._data.blocks + values = np.arange(3, dtype='int64') + custom_block = CustomBlock(values, placement=slice(1, 2)) + blocks = blocks + (custom_block, ) + block_manager = BlockManager(blocks, [pd.Index(['a', 'b']), df.index]) + df = pd.DataFrame(block_manager) + res = pd.concat([df, df]) + assert isinstance(res._data.blocks[1], CustomBlock) From bd561d91f31907ca9ce9174874b902b4bd70d590 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Oct 2017 11:54:34 +0200 Subject: [PATCH 07/11] fix categorical and datetimetz (when in dataframe + converted to object) --- pandas/core/internals.py | 21 +++++++++++-------- pandas/core/reshape/concat.py | 2 +- pandas/tests/internals/test_external_block.py | 3 ++- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f209d2bad7a5b..b0b55706abd18 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2421,13 +2421,15 @@ def concat_same_type(self, to_concat, placement=None): Concatenate list of single blocks of the same type. """ to_concat = [blk.values for blk in to_concat] - values = _concat._concat_categorical(to_concat) + values = _concat._concat_categorical(to_concat, axis=self.ndim - 1) if is_categorical_dtype(values.dtype): return self.make_block_same_class( values, placement=placement or slice(0, len(values), 1)) else: - return make_block(values, placement=placement or slice(0, len(values), 1)) + return make_block( + values, placement=placement or slice(0, len(values), 1), + ndim=self.ndim) class DatetimeBlock(DatetimeLikeBlockMixin, Block): @@ -2711,13 +2713,14 @@ def concat_same_type(self, to_concat, placement=None): Concatenate list of single blocks of the same type. """ to_concat = [blk.values for blk in to_concat] - values = _concat._concat_datetime(to_concat) + values = _concat._concat_datetime(to_concat, axis=self.ndim - 1) if is_datetimetz(values): return self.make_block_same_class( values, placement=placement or slice(0, len(values), 1)) else: - return make_block(values, placement=placement or slice(0, len(values), 1)) + return make_block( + values, placement=placement or slice(0, len(values), 1)) class SparseBlock(NonConsolidatableMixIn, Block): @@ -5172,15 +5175,15 @@ def is_uniform_join_units(join_units): """ return ( # all blocks need to have the same type - all([type(ju.block) is type(join_units[0].block) for ju in join_units]) # noqa + all([type(ju.block) is type(join_units[0].block) for ju in join_units]) and # noqa # no blocks that would get missing values (can lead to type upcasts) - and all([not ju.is_na for ju in join_units]) + all([not ju.is_na for ju in join_units]) and # no blocks with indexers (as then the dimensions do not fit) - and all([not ju.indexers for ju in join_units]) + all([not ju.indexers for ju in join_units]) and # disregard Panels - and all([ju.block.ndim <= 2 for ju in join_units]) + all([ju.block.ndim <= 2 for ju in join_units]) and # only use this path when there is something to concatenate - and len(join_units) > 1) + len(join_units) > 1) def get_empty_dtype_and_na(join_units): diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 11b65aa3d1a71..c2aeef0a30334 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -9,7 +9,7 @@ _all_indexes_same) from pandas.core.categorical import (_factorize_from_iterable, _factorize_from_iterables) -from pandas.core.internals import concatenate_block_managers, SparseBlock +from pandas.core.internals import concatenate_block_managers from pandas.core import common as com from pandas.core.generic import NDFrame import pandas.core.dtypes.concat as _concat diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/internals/test_external_block.py index 4b10b8cdd72d0..aa581ccd3bf8a 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -4,7 +4,8 @@ import numpy as np import pandas as pd -from pandas.core.internals import Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn +from pandas.core.internals import ( + Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn) class CustomBlock(NonConsolidatableMixIn, Block): From 0c90561b4d5f0a4b3dd434e52d1c44f73aaa2b55 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Oct 2017 12:31:13 +0200 Subject: [PATCH 08/11] small clean-up --- pandas/core/dtypes/concat.py | 10 +++++----- pandas/core/internals.py | 15 +++++++++++++++ pandas/core/reshape/concat.py | 7 +------ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f6f956832eebe..93993fd0a0cab 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -63,11 +63,12 @@ def get_dtype_kinds(l): return typs -def _get_series_result_type(result): +def _get_series_result_type(result, objs=None): """ return appropriate class of Series concat input is either dict or array-like """ + # concat Series with axis 1 if isinstance(result, dict): # concat Series with axis 1 if all(is_sparse(c) for c in compat.itervalues(result)): @@ -77,13 +78,12 @@ def _get_series_result_type(result): from pandas.core.frame import DataFrame return DataFrame - elif is_sparse(result): - # concat Series with axis 1 + # otherwise it is a SingleBlockManager (axis = 0) + if result._block.is_sparse: from pandas.core.sparse.api import SparseSeries return SparseSeries else: - from pandas.core.series import Series - return Series + return objs[0]._constructor def _get_frame_result_type(result, objs): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b0b55706abd18..be031827ee619 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4539,7 +4539,22 @@ def fast_xs(self, loc): return self._block.values[loc] def concat(self, to_concat, new_axis): + """ + Concatenate a list of SingleBlockManagers into a single + SingleBlockManager. + + Used for pd.concat of Series objects with axis=0. + + Parameters + ---------- + to_concat : list of SingleBlockManagers + new_axis : Index of the result + Returns + ------- + SingleBlockManager + + """ non_empties = [x for x in to_concat if len(x) > 0] # check if all series are of the same block type: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index c2aeef0a30334..5d065d53bc827 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -366,12 +366,7 @@ def get_result(self): mgr = self.objs[0]._data.concat([x._data for x in self.objs], self.new_axes) - - if mgr._block.is_sparse: - cons = SparseSeries - else: - cons = self.objs[0].__class__ - + cons = _concat._get_series_result_type(mgr, self.objs) return cons(mgr, name=name).__finalize__(self, method='concat') # combine as columns in a frame From d3a79e7f8f0f8aa6ec65bc2e8300b7bc795e1798 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Oct 2017 13:33:46 +0200 Subject: [PATCH 09/11] fix pep8 --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 5d065d53bc827..c54763f8ebde1 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -3,7 +3,7 @@ """ import numpy as np -from pandas import compat, DataFrame, Series, Index, MultiIndex, SparseSeries +from pandas import compat, DataFrame, Series, Index, MultiIndex from pandas.core.index import (_get_objs_combined_axis, _ensure_index, _get_consensus_names, _all_indexes_same) From 315f16bc567491dfae743ddb7a455d6650009b9b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Oct 2017 17:29:18 +0200 Subject: [PATCH 10/11] remove if/else in categorical/datetimetz concat_same_type --- pandas/core/internals.py | 20 +++++++------------ pandas/tests/internals/test_external_block.py | 2 ++ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 322ff0415367e..cc53b02151462 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2448,13 +2448,10 @@ def concat_same_type(self, to_concat, placement=None): to_concat = [blk.values for blk in to_concat] values = _concat._concat_categorical(to_concat, axis=self.ndim - 1) - if is_categorical_dtype(values.dtype): - return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1)) - else: - return make_block( - values, placement=placement or slice(0, len(values), 1), - ndim=self.ndim) + # not using self.make_block_same_class as values can be object dtype + return make_block( + values, placement=placement or slice(0, len(values), 1), + ndim=self.ndim) class DatetimeBlock(DatetimeLikeBlockMixin, Block): @@ -2742,12 +2739,9 @@ def concat_same_type(self, to_concat, placement=None): to_concat = [blk.values for blk in to_concat] values = _concat._concat_datetime(to_concat, axis=self.ndim - 1) - if is_datetimetz(values): - return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1)) - else: - return make_block( - values, placement=placement or slice(0, len(values), 1)) + # not using self.make_block_same_class as values can be non-tz dtype + return make_block( + values, placement=placement or slice(0, len(values), 1)) class SparseBlock(NonConsolidatableMixIn, Block): diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/internals/test_external_block.py index aa581ccd3bf8a..d98b293ed8daa 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -42,6 +42,7 @@ def test_custom_repr(): def test_concat_series(): + # GH17728 values = np.arange(3, dtype='int64') block = CustomBlock(values, placement=slice(0, 3)) s = pd.Series(block, pd.RangeIndex(3), fastpath=True) @@ -51,6 +52,7 @@ def test_concat_series(): def test_concat_dataframe(): + # GH17728 df = pd.DataFrame({'a': [1, 2, 3]}) blocks = df._data.blocks values = np.arange(3, dtype='int64') From bb5a100b314d7f19dda19bfb5ef0889ac3dd5e36 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Oct 2017 01:27:03 +0200 Subject: [PATCH 11/11] Use _concatenator attribute on Block class --- pandas/core/internals.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index cc53b02151462..f4f231be570c2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -102,6 +102,7 @@ class Block(PandasObject): _validate_ndim = True _ftype = 'dense' _holder = None + _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None, fastpath=False): if ndim is None: @@ -318,8 +319,8 @@ def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ - values = np.concatenate([blk.values for blk in to_concat], - axis=self.ndim - 1) + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) return self.make_block_same_class( values, placement=placement or slice(0, len(values), 1)) @@ -2318,6 +2319,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): _verify_integrity = True _can_hold_na = True _holder = Categorical + _concatenator = staticmethod(_concat._concat_categorical) def __init__(self, values, placement, fastpath=False, **kwargs): @@ -2445,9 +2447,8 @@ def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ - to_concat = [blk.values for blk in to_concat] - values = _concat._concat_categorical(to_concat, axis=self.ndim - 1) - + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) # not using self.make_block_same_class as values can be object dtype return make_block( values, placement=placement or slice(0, len(values), 1), @@ -2592,6 +2593,7 @@ class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ __slots__ = () _holder = DatetimeIndex + _concatenator = staticmethod(_concat._concat_datetime) is_datetimetz = True def __init__(self, values, placement, ndim=2, **kwargs): @@ -2736,9 +2738,8 @@ def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ - to_concat = [blk.values for blk in to_concat] - values = _concat._concat_datetime(to_concat, axis=self.ndim - 1) - + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) # not using self.make_block_same_class as values can be non-tz dtype return make_block( values, placement=placement or slice(0, len(values), 1)) @@ -2753,6 +2754,7 @@ class SparseBlock(NonConsolidatableMixIn, Block): _can_hold_na = True _ftype = 'sparse' _holder = SparseArray + _concatenator = staticmethod(_concat._concat_sparse) @property def shape(self): @@ -2910,16 +2912,6 @@ def sparse_reindex(self, new_index): return self.make_block_same_class(values, sparse_index=new_index, placement=self.mgr_locs) - def concat_same_type(self, to_concat, placement=None): - """ - Concatenate list of single blocks of the same type. - """ - to_concat = [blk.values for blk in to_concat] - values = _concat._concat_sparse(to_concat) - - return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1)) - def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=False):