Skip to content

Commit b50d765

Browse files
jorisvandenbossche authored and No-Stream committed
REF/INT: concat blocks of same type with preserving block type (pandas-dev#17728)
1 parent 4c16faa commit b50d765

File tree

4 files changed

+152
-23
lines changed

4 files changed

+152
-23
lines changed

pandas/core/dtypes/concat.py

+5-5
Original file line number | Diff line number | Diff line change
@@ -63,11 +63,12 @@ def get_dtype_kinds(l):
6363
return typs
6464

6565

66-
def _get_series_result_type(result):
66+
def _get_series_result_type(result, objs=None):
6767
"""
6868
return appropriate class of Series concat
6969
input is either dict or array-like
7070
"""
71+
# concat Series with axis 1
7172
if isinstance(result, dict):
7273
# concat Series with axis 1
7374
if all(is_sparse(c) for c in compat.itervalues(result)):
@@ -77,13 +78,12 @@ def _get_series_result_type(result):
7778
from pandas.core.frame import DataFrame
7879
return DataFrame
7980

80-
elif is_sparse(result):
81-
# concat Series with axis 1
81+
# otherwise it is a SingleBlockManager (axis = 0)
82+
if result._block.is_sparse:
8283
from pandas.core.sparse.api import SparseSeries
8384
return SparseSeries
8485
else:
85-
from pandas.core.series import Series
86-
return Series
86+
return objs[0]._constructor
8787

8888

8989
def _get_frame_result_type(result, objs):

pandas/core/internals.py

+105-3
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,7 @@ class Block(PandasObject):
102102
_validate_ndim = True
103103
_ftype = 'dense'
104104
_holder = None
105+
_concatenator = staticmethod(np.concatenate)
105106

106107
def __init__(self, values, placement, ndim=None, fastpath=False):
107108
if ndim is None:
@@ -314,6 +315,15 @@ def ftype(self):
314315
def merge(self, other):
315316
return _merge_blocks([self, other])
316317

318+
def concat_same_type(self, to_concat, placement=None):
    """
    Concatenate a list of single blocks of the same type into one block.

    Parameters
    ----------
    to_concat : list of blocks whose ``values`` are joined
    placement : placement of the resulting block, optional
        When falsy, ``slice(0, len(values), 1)`` is used instead.

    Returns
    -------
    The block produced by ``self.make_block_same_class``.
    """
    pieces = [blk.values for blk in to_concat]
    # the blocks are joined along the last axis of this block
    last_axis = self.ndim - 1
    values = self._concatenator(pieces, axis=last_axis)
    if not placement:
        placement = slice(0, len(values), 1)
    return self.make_block_same_class(values, placement=placement)
326+
317327
def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
318328
limit=None, mask_info=None):
319329
"""
@@ -2309,6 +2319,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock):
23092319
_verify_integrity = True
23102320
_can_hold_na = True
23112321
_holder = Categorical
2322+
_concatenator = staticmethod(_concat._concat_categorical)
23122323

23132324
def __init__(self, values, placement, fastpath=False, **kwargs):
23142325

@@ -2432,6 +2443,17 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
24322443
# we are expected to return a 2-d ndarray
24332444
return values.reshape(1, len(values))
24342445

2446+
def concat_same_type(self, to_concat, placement=None):
    """
    Concatenate a list of single blocks of the same type into one block.

    Parameters
    ----------
    to_concat : list of blocks whose ``values`` are joined
    placement : placement of the resulting block, optional
        When falsy, ``slice(0, len(values), 1)`` is used instead.

    Returns
    -------
    The block produced by ``make_block`` with ``ndim=self.ndim``.
    """
    pieces = [blk.values for blk in to_concat]
    values = self._concatenator(pieces, axis=self.ndim - 1)
    if not placement:
        placement = slice(0, len(values), 1)
    # the generic make_block is used rather than
    # self.make_block_same_class because values can be object dtype
    return make_block(values, placement=placement, ndim=self.ndim)
2456+
24352457

24362458
class DatetimeBlock(DatetimeLikeBlockMixin, Block):
24372459
__slots__ = ()
@@ -2571,6 +2593,7 @@ class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock):
25712593
""" implement a datetime64 block with a tz attribute """
25722594
__slots__ = ()
25732595
_holder = DatetimeIndex
2596+
_concatenator = staticmethod(_concat._concat_datetime)
25742597
is_datetimetz = True
25752598

25762599
def __init__(self, values, placement, ndim=2, **kwargs):
@@ -2711,6 +2734,16 @@ def shift(self, periods, axis=0, mgr=None):
27112734
return [self.make_block_same_class(new_values,
27122735
placement=self.mgr_locs)]
27132736

2737+
def concat_same_type(self, to_concat, placement=None):
    """
    Concatenate a list of single blocks of the same type into one block.

    Parameters
    ----------
    to_concat : list of blocks whose ``values`` are joined
    placement : placement of the resulting block, optional
        When falsy, ``slice(0, len(values), 1)`` is used instead.

    Returns
    -------
    The block produced by ``make_block``.
    """
    pieces = [blk.values for blk in to_concat]
    values = self._concatenator(pieces, axis=self.ndim - 1)
    if not placement:
        placement = slice(0, len(values), 1)
    # the generic make_block is used rather than
    # self.make_block_same_class because values can be non-tz dtype
    return make_block(values, placement=placement)
2746+
27142747

27152748
class SparseBlock(NonConsolidatableMixIn, Block):
27162749
""" implement as a list of sparse arrays of the same dtype """
@@ -2721,6 +2754,7 @@ class SparseBlock(NonConsolidatableMixIn, Block):
27212754
_can_hold_na = True
27222755
_ftype = 'sparse'
27232756
_holder = SparseArray
2757+
_concatenator = staticmethod(_concat._concat_sparse)
27242758

27252759
@property
27262760
def shape(self):
@@ -4517,6 +4551,45 @@ def fast_xs(self, loc):
45174551
"""
45184552
return self._block.values[loc]
45194553

4554+
def concat(self, to_concat, new_axis):
    """
    Combine several SingleBlockManagers into one SingleBlockManager.

    Used for pd.concat of Series objects with axis=0.

    Parameters
    ----------
    to_concat : list of SingleBlockManagers
    new_axis : Index of the result

    Returns
    -------
    SingleBlockManager
    """
    with_data = [mgr for mgr in to_concat if len(mgr) > 0]

    if with_data:
        blocks = [mgr.blocks[0] for mgr in with_data]
        first_type = type(blocks[0])

        # when every non-empty manager holds the same block type, let the
        # block do the concatenation itself so that its type is preserved
        if all(type(blk) is first_type for blk in blocks[1:]):
            same_type_block = blocks[0].concat_same_type(blocks)
            return SingleBlockManager(same_type_block, new_axis)

        arrays = [blk.values for blk in blocks]
    else:
        # every input is empty: fall back to the values of all managers
        arrays = [mgr._block.values for mgr in to_concat]

    # generic path shared by the mixed-type and the all-empty cases
    merged = _concat._concat_compat(arrays)
    generic_block = make_block(merged, placement=slice(0, len(merged), 1))
    return SingleBlockManager(generic_block, new_axis)
4592+
45204593

45214594
def construction_error(tot_items, block_shape, axes, e=None):
45224595
""" raise a helpful message about our construction """
@@ -5105,13 +5178,42 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
51055178
[get_mgr_concatenation_plan(mgr, indexers)
51065179
for mgr, indexers in mgrs_indexers], concat_axis)
51075180

5108-
blocks = [make_block(
5109-
concatenate_join_units(join_units, concat_axis, copy=copy),
5110-
placement=placement) for placement, join_units in concat_plan]
5181+
blocks = []
5182+
5183+
for placement, join_units in concat_plan:
5184+
5185+
if is_uniform_join_units(join_units):
5186+
b = join_units[0].block.concat_same_type(
5187+
[ju.block for ju in join_units], placement=placement)
5188+
else:
5189+
b = make_block(
5190+
concatenate_join_units(join_units, concat_axis, copy=copy),
5191+
placement=placement)
5192+
blocks.append(b)
51115193

51125194
return BlockManager(blocks, axes)
51135195

51145196

5197+
def is_uniform_join_units(join_units):
    """
    Check if the join units consist of blocks of uniform type that can
    be concatenated using Block.concat_same_type instead of the generic
    concatenate_join_units (which uses `_concat._concat_compat`).

    Parameters
    ----------
    join_units : list of join units
        Each unit is read through its ``block``, ``is_na`` and
        ``indexers`` attributes.

    Returns
    -------
    bool
    """
    # only use this path when there is something to concatenate; testing
    # this first means trivial inputs are rejected without inspecting units
    if len(join_units) <= 1:
        return False

    first_type = type(join_units[0].block)

    # generator expressions let each check stop at the first offending unit
    return (
        # all blocks need to have the same type
        all(type(ju.block) is first_type for ju in join_units) and
        # no blocks that would get missing values (can lead to type upcasts)
        not any(ju.is_na for ju in join_units) and
        # no blocks with indexers (as then the dimensions do not fit)
        not any(ju.indexers for ju in join_units) and
        # disregard Panels
        all(ju.block.ndim <= 2 for ju in join_units))
5215+
5216+
51155217
def get_empty_dtype_and_na(join_units):
51165218
"""
51175219
Return dtype and N/A values to use when concatenating specified units.

pandas/core/reshape/concat.py

+4-12
Original file line number | Diff line number | Diff line change
@@ -362,20 +362,12 @@ def get_result(self):
362362

363363
# stack blocks
364364
if self.axis == 0:
365-
# concat Series with length to keep dtype as much
366-
non_empties = [x for x in self.objs if len(x) > 0]
367-
if len(non_empties) > 0:
368-
values = [x._values for x in non_empties]
369-
else:
370-
values = [x._values for x in self.objs]
371-
new_data = _concat._concat_compat(values)
372-
373365
name = com._consensus_name_attr(self.objs)
374-
cons = _concat._get_series_result_type(new_data)
375366

376-
return (cons(new_data, index=self.new_axes[0],
377-
name=name, dtype=new_data.dtype)
378-
.__finalize__(self, method='concat'))
367+
mgr = self.objs[0]._data.concat([x._data for x in self.objs],
368+
self.new_axes)
369+
cons = _concat._get_series_result_type(mgr, self.objs)
370+
return cons(mgr, name=name).__finalize__(self, method='concat')
379371

380372
# combine as columns in a frame
381373
else:

pandas/tests/internals/test_external_block.py

+38-3
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,26 @@
44
import numpy as np
55

66
import pandas as pd
7-
from pandas.core.internals import Block, BlockManager, SingleBlockManager
7+
from pandas.core.internals import (
8+
Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn)
89

910

10-
class CustomBlock(Block):
11+
class CustomBlock(NonConsolidatableMixIn, Block):
12+
13+
_holder = np.ndarray
1114

1215
def formatting_values(self):
    """Return an array with each value rendered as ``"Val: <value>"``."""
    template = "Val: {}"
    labels = [template.format(item) for item in self.values]
    return np.array(labels)
1417

18+
def concat_same_type(self, to_concat, placement=None):
    """
    Join the values of all given blocks into one block of this class.

    self.ndim is deliberately ignored here, as the values are always
    1D in this custom Block. When ``placement`` is falsy,
    ``slice(0, len(values), 1)`` is used.
    """
    joined = np.concatenate([blk.values for blk in to_concat])
    target = placement if placement else slice(0, len(joined), 1)
    return self.make_block_same_class(joined, placement=target)
26+
1527

1628
def test_custom_repr():
1729
values = np.arange(3, dtype='int64')
@@ -23,7 +35,30 @@ def test_custom_repr():
2335
assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64'
2436

2537
# dataframe
26-
block = CustomBlock(values.reshape(1, -1), placement=slice(0, 1))
38+
block = CustomBlock(values, placement=slice(0, 1))
2739
blk_mgr = BlockManager([block], [['col'], range(3)])
2840
df = pd.DataFrame(blk_mgr)
2941
assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2'
42+
43+
44+
def test_concat_series():
    """Concatenating Series backed by a CustomBlock keeps that block type."""
    # GH17728
    data = np.arange(3, dtype='int64')
    custom = CustomBlock(data, placement=slice(0, 3))
    ser = pd.Series(custom, pd.RangeIndex(3), fastpath=True)

    result = pd.concat([ser, ser])
    first_block = result._data.blocks[0]
    assert isinstance(first_block, CustomBlock)
52+
53+
54+
def test_concat_dataframe():
    """Concatenating DataFrames holding a CustomBlock keeps that block type."""
    # GH17728
    base = pd.DataFrame({'a': [1, 2, 3]})
    data = np.arange(3, dtype='int64')
    extra_block = CustomBlock(data, placement=slice(1, 2))
    # append the custom block as a second column next to the regular one
    all_blocks = base._data.blocks + (extra_block, )
    axes = [pd.Index(['a', 'b']), base.index]
    frame = pd.DataFrame(BlockManager(all_blocks, axes))

    result = pd.concat([frame, frame])
    assert isinstance(result._data.blocks[1], CustomBlock)

0 commit comments

Comments
 (0)