Commit 1317f58

ENH: refactor to add hierarchical index creation in concat. Tests needed still
1 parent 7d9e99c commit 1317f58

3 files changed: +68 -46 lines

pandas/core/groupby.py

Lines changed: 8 additions & 5 deletions
@@ -449,12 +449,15 @@ def _wrap_frames(self, keys, values, not_indexed_same=False):
         from pandas.tools.merge import concat, _concat_frames_hierarchical

         if not_indexed_same:
-            result = _concat_frames_hierarchical(values, keys,
-                                                 self.groupings,
-                                                 axis=self.axis)
+            group_keys = keys
+            group_levels = [ping.group_index for ping in self.groupings]
+            group_names = [ping.name for ping in self.groupings]
+            result = concat(values, axis=self.axis, keys=group_keys,
+                            levels=group_levels, names=group_names)
         else:
-            result = concat(values, axis=0, verify_integrity=False)
-            result = result.reindex(self.obj.index)
+            result = concat(values, axis=self.axis)
+            ax = self.obj._get_axis(self.axis)
+            result = result.reindex_axis(ax, axis=self.axis)

         return result

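At the call site, the effect of this change is that GroupBy results which are not indexed like the original object now get their group keys folded into a hierarchical index by concat itself, instead of by the private _concat_frames_hierarchical helper. A minimal sketch of the resulting behaviour, shown through the public pd.concat wrapper with made-up data (the frames and the 'group' name are illustrative, not from the commit):

    import pandas as pd

    # Two per-group results, as GroupBy.apply produces when the pieces are
    # not indexed the same as the original frame.
    pieces = [pd.DataFrame({'x': [1, 2]}), pd.DataFrame({'x': [3, 4]})]

    # keys become the outer level of the concatenation axis; names labels
    # that level; each piece keeps its own index as the inner level.
    result = pd.concat(pieces, axis=0, keys=['a', 'b'], names=['group'])

    # result.index is a MultiIndex:
    # ('a', 0), ('a', 1), ('b', 0), ('b', 1), with the outer level named 'group'
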
pandas/tools/merge.py

Lines changed: 57 additions & 41 deletions
@@ -30,33 +30,6 @@ def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
 if __debug__: merge.__doc__ = _merge_doc % '\nleft : DataFrame'


-def concat(objs, axis=0, join='outer', join_axes=None,
-           ignore_index=False, verify_integrity=False):
-    """
-    Concatenate DataFrame objects row or column wise
-
-    Parameters
-    ----------
-    objs : list of DataFrame objects
-    axis : {0, 1}, default 0
-        The axis to concatenate along
-    join : {'inner', 'outer'}, default 'outer'
-        How to handle indexes on other axis(es)
-    join_index : index-like
-    verify_integrity : boolean, default False
-        Check whether the new concatenated axis contains duplicates. This can
-        be very expensive relative to the actual data concatenation
-
-    Returns
-    -------
-    concatenated : DataFrame
-    """
-    op = Concatenator(objs, axis=axis, join_axes=join_axes,
-                      ignore_index=ignore_index, join=join,
-                      verify_integrity=verify_integrity)
-    return op.get_result()
-
-

 # TODO: NA group handling
 # TODO: transformations??
@@ -614,14 +587,42 @@ def _get_all_block_kinds(blockmaps):
 #----------------------------------------------------------------------
 # Concatenate DataFrame objects

+def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
+           keys=None, names=None, levels=None, verify_integrity=False):
+    """
+    Concatenate DataFrame objects row or column wise
+
+    Parameters
+    ----------
+    objs : list of DataFrame objects
+    axis : {0, 1}, default 0
+        The axis to concatenate along
+    join : {'inner', 'outer'}, default 'outer'
+        How to handle indexes on other axis(es)
+    join_index : index-like
+    verify_integrity : boolean, default False
+        Check whether the new concatenated axis contains duplicates. This can
+        be very expensive relative to the actual data concatenation
+
+    Returns
+    -------
+    concatenated : DataFrame
+    """
+    op = _Concatenator(objs, axis=axis, join_axes=join_axes,
+                       ignore_index=ignore_index, join=join,
+                       keys=keys, levels=levels, names=names,
+                       verify_integrity=verify_integrity)
+    return op.get_result()
+

-class Concatenator(object):
+class _Concatenator(object):
     """
     Orchestrates a concatenation operation for BlockManagers, with little hacks
     to support sparse data structures, etc.
     """

     def __init__(self, objs, axis=0, join='outer', join_axes=None,
+                 keys=None, levels=None, names=None,
                  ignore_index=False, verify_integrity=False):
         if join == 'outer':
             self.intersect = False
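The keys/levels/names trio added to the signature maps onto _make_concat_multiindex below: keys supplies one outer-level value per object, levels optionally pins the full, ordered value set for that level, and names labels it. A hedged sketch of how levels interacts with keys, again via the public pd.concat wrapper and invented values:

    import pandas as pd

    pieces = [pd.DataFrame({'x': [1]}), pd.DataFrame({'x': [2]})]

    # keys assigns each piece its outer-level value; levels fixes the level's
    # value set and ordering independently of key order; names labels it.
    result = pd.concat(pieces, keys=['b', 'a'],
                       levels=[['a', 'b']], names=['group'])

    list(result.index.levels[0])                  # ['a', 'b']  (from levels)
    list(result.index.get_level_values('group'))  # ['b', 'a']  (from keys)
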
@@ -645,6 +646,10 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,

         self.join_axes = join_axes

+        self.keys = keys
+        self.names = names
+        self.levels = levels
+
         self.ignore_index = ignore_index
         self.verify_integrity = verify_integrity

@@ -763,9 +768,7 @@ def _get_new_axes(self):
         if self.ignore_index:
             concat_axis = None
         else:
-            concat_axis = _concat_indexes([x._data.axes[self.axis]
-                                           for x in self.objs])
-            self._maybe_check_integrity(concat_axis)
+            concat_axis = self._get_concat_axis()

         new_axes[self.axis] = concat_axis

@@ -790,6 +793,19 @@ def _get_new_axes(self):

         return new_axes

+    def _get_concat_axis(self):
+        indexes = [x._data.axes[self.axis] for x in self.objs]
+
+        if self.keys is None:
+            concat_axis = _concat_indexes(indexes)
+        else:
+            concat_axis = _make_concat_multiindex(indexes, self.keys,
+                                                  self.levels, self.names)
+
+        self._maybe_check_integrity(concat_axis)
+
+        return concat_axis
+
     def _maybe_check_integrity(self, concat_index):
         if self.verify_integrity:
             if not concat_index._verify_integrity():
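The new _get_concat_axis is now the single dispatch point for the concatenation axis: with no keys the input axes are simply appended, with keys they are wrapped in a MultiIndex. An independent illustration of the two resulting shapes, using plain pandas objects rather than this module's private helpers:

    import pandas as pd

    idx1, idx2 = pd.Index([0, 1]), pd.Index([0, 1])

    # keys is None: the axes are appended as-is (duplicates allowed unless
    # verify_integrity catches them).
    flat = idx1.append(idx2)            # Index([0, 1, 0, 1])

    # keys given: the same positions, wrapped in a MultiIndex whose outer
    # level carries one key per input object.
    hier = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [0, 1, 0, 1]],
                                     names=['group', None])
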
@@ -798,10 +814,7 @@ def _maybe_check_integrity(self, concat_index):
                              % str(overlap))


-def _concat_frames_hierarchical(frames, keys, groupings, axis=0):
-    names = [ping.name for ping in groupings]
-    levels = [ping.group_index for ping in groupings]
-
+def _concat_frames_hierarchical(frames, keys, names, levels, axis=0):
     if axis == 0:
         indexes = [x.index for x in frames]
         new_index = _make_concat_multiindex(indexes, keys, levels, names)
@@ -852,12 +865,7 @@ def _make_concat_multiindex(indexes, keys, levels, names):
         else:
            label_list.append(concat_index.values)

-        consensus_name = indexes[0].names
-        for index in indexes[1:]:
-            if index.names != consensus_name:
-                consensus_name = [None] * index.nlevels
-                break
-        names.extend(consensus_name)
+        names.extend(_get_consensus_names(indexes))

        return MultiIndex.from_arrays(label_list, names=names)

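The extracted helper keeps the existing rule for naming the non-key levels: the input index names survive only if every input agrees on them, otherwise they fall back to None. A small hedged illustration of that rule through the public wrapper, with invented index names:

    import pandas as pd

    a = pd.DataFrame({'x': [1]}, index=pd.Index(['i'], name='id'))
    b = pd.DataFrame({'x': [2]}, index=pd.Index(['j'], name='id'))
    c = pd.DataFrame({'x': [3]}, index=pd.Index(['k'], name='other'))

    # All inputs agree on the index name -> the inner level keeps it.
    pd.concat([a, b], keys=[1, 2]).index.names   # [None, 'id']

    # Any disagreement -> the inner level name falls back to None.
    pd.concat([a, c], keys=[1, 2]).index.names   # [None, None]
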
@@ -887,6 +895,14 @@ def _make_concat_multiindex(indexes, keys, levels, names):
     labels.append(np.tile(np.arange(n), len(indexes)))
     return MultiIndex(levels=new_levels, labels=labels, names=names)

+def _get_consensus_names(indexes):
+    consensus_name = indexes[0].names
+    for index in indexes[1:]:
+        if index.names != consensus_name:
+            consensus_name = [None] * index.nlevels
+            break
+    return consensus_name
+
 def _all_indexes_same(indexes):
     first = indexes[0]
     for index in indexes[1:]:

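The context of this last hunk shows the fast path used when every input shares the same index: the outer-level codes repeat each key across its block while the inner-level codes tile the shared positions. A standalone sketch of that construction with made-up sizes, using the modern codes= keyword where this era's constructor says labels=:

    import numpy as np
    import pandas as pd

    keys = ['a', 'b']                  # one key per input object
    shared = pd.Index([10, 20, 30])    # the index every input shares
    n, k = len(shared), len(keys)

    levels = [pd.Index(keys), shared]
    codes = [np.repeat(np.arange(k), n),   # a a a b b b  -> outer level
             np.tile(np.arange(n), k)]     # 0 1 2 0 1 2  -> inner level

    mi = pd.MultiIndex(levels=levels, codes=codes, names=['group', None])
    # ('a', 10), ('a', 20), ('a', 30), ('b', 10), ('b', 20), ('b', 30)
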
pandas/tools/tests/test_merge.py

Lines changed: 3 additions & 0 deletions
@@ -741,6 +741,9 @@ def test_append_missing_column_proper_upcast(self):
         self.assert_(appended['A'].dtype == 'f8')
         self.assert_(appended['B'].dtype == 'O')

+    def test_concat_with_group_keys(self):
+        pass
+
     def test_crossed_dtypes_weird_corner(self):
         columns = ['A', 'B', 'C', 'D']
         df1 = DataFrame({'A' : np.array([1, 2, 3, 4], dtype='f8'),
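
The new test is only a stub, consistent with the "Tests needed still" note in the commit message. A hypothetical body exercising the new keyword path might look like the following; it is written against today's public API (pd.concat, pandas.testing) and is not the commit's actual test:

    import pandas as pd
    import pandas.testing as tm

    def test_concat_with_group_keys():
        # Hypothetical coverage for the new keys/names arguments; the commit
        # itself leaves the test body as `pass`.
        pieces = [pd.DataFrame({'x': [1.0, 2.0]}),
                  pd.DataFrame({'x': [3.0, 4.0]})]
        result = pd.concat(pieces, keys=['one', 'two'], names=['group'])

        expected_index = pd.MultiIndex.from_arrays(
            [['one', 'one', 'two', 'two'], [0, 1, 0, 1]],
            names=['group', None])
        expected = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]},
                                index=expected_index)
        tm.assert_frame_equal(result, expected)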
