Skip to content

Commit 35f3322

Browse files
committed
ENH: add multiple-join to DataFrame with new concat function, test append multiple, #115, #479, #273
1 parent 3b1c5b7 commit 35f3322

File tree

5 files changed

+102
-39
lines changed

5 files changed

+102
-39
lines changed

pandas/__init__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -30,4 +30,4 @@
3030
from pandas.util.testing import debug
3131

3232
from pandas.tools.pivot import pivot_table
33-
from pandas.tools.merge import merge
33+
from pandas.tools.merge import merge, concat

pandas/core/frame.py

+31-24
Original file line number | Diff line number | Diff line change
@@ -301,9 +301,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
301301

302302
def _init_mgr(self, mgr, index, columns, dtype=None, copy=False):
303303
if columns is not None:
304-
mgr = mgr.reindex_axis(columns, axis=0)
304+
mgr = mgr.reindex_axis(columns, axis=0, copy=False)
305305
if index is not None:
306-
mgr = mgr.reindex_axis(index, axis=1)
306+
mgr = mgr.reindex_axis(index, axis=1, copy=False)
307307
# do not copy BlockManager unless explicitly done
308308
if copy and dtype is None:
309309
mgr = mgr.copy()
@@ -2715,7 +2715,7 @@ def applymap(self, func):
27152715
#----------------------------------------------------------------------
27162716
# Merging / joining methods
27172717

2718-
def append(self, other, ignore_index=False):
2718+
def append(self, other, ignore_index=False, verify_integrity=True):
27192719
"""
27202720
Append columns of other to end of this frame's columns and index.
27212721
Columns not in this frame are added as new columns.
@@ -2749,19 +2749,20 @@ def append(self, other, ignore_index=False):
27492749
else:
27502750
to_concat = [self, other]
27512751
return concat(to_concat, ignore_index=ignore_index,
2752-
verify_integrity=True)
2752+
verify_integrity=verify_integrity)
27532753

27542754
def _get_raw_column(self, col):
27552755
return self._data.get(col)
27562756

27572757
def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
27582758
"""
27592759
Join columns with other DataFrame either on index or on a key
2760-
column.
2760+
column. Efficiently join multiple DataFrame objects by index at once by
2761+
passing a list.
27612762
27622763
Parameters
27632764
----------
2764-
other : DataFrame, or Series with name field set
2765+
other : DataFrame, Series with name field set, or list of DataFrame
27652766
Index should be similar to one of the columns in this one. If a
27662767
Series is passed, its name attribute must be set, and that will be
27672768
used as the column name in the resulting joined DataFrame
@@ -2782,6 +2783,11 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
27822783
rsuffix : string
27832784
Suffix to use from right frame's overlapping columns
27842785
2786+
Notes
2787+
-----
2788+
on, lsuffix, and rsuffix options are not supported when passing a list
2789+
of DataFrame objects
2790+
27852791
Returns
27862792
-------
27872793
joined : DataFrame
@@ -2791,15 +2797,30 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
27912797
rsuffix=rsuffix)
27922798

27932799
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
2794-
from pandas.tools.merge import merge
2800+
from pandas.tools.merge import merge, concat
27952801

27962802
if isinstance(other, Series):
27972803
assert(other.name is not None)
27982804
other = DataFrame({other.name : other})
27992805

2800-
return merge(self, other, left_on=on, how=how,
2801-
left_index=on is None, right_index=True,
2802-
suffixes=(lsuffix, rsuffix), sort=False)
2806+
if isinstance(other, DataFrame):
2807+
return merge(self, other, left_on=on, how=how,
2808+
left_index=on is None, right_index=True,
2809+
suffixes=(lsuffix, rsuffix), sort=False)
2810+
else:
2811+
if on is not None:
2812+
raise ValueError('Joining multiple DataFrames only supported'
2813+
' for joining on index')
2814+
2815+
# join indexes only using concat
2816+
if how == 'left':
2817+
how = 'outer'
2818+
join_index = self.index
2819+
else:
2820+
join_index = None
2821+
2822+
return concat([self] + list(other), axis=1, join=how,
2823+
join_index=join_index, verify_integrity=True)
28032824

28042825
def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
28052826
left_index=False, right_index=False, sort=True,
@@ -3623,20 +3644,6 @@ def extract_index(data):
36233644
return _ensure_index(index)
36243645

36253646

3626-
3627-
def _check_data_types(data):
3628-
have_raw_arrays = False
3629-
have_series = False
3630-
for v in data.values():
3631-
if not isinstance(v, (dict, Series)):
3632-
have_raw_arrays = True
3633-
else:
3634-
have_series = True
3635-
3636-
is_mixed = have_series and have_raw_arrays
3637-
return have_raw_arrays, is_mixed
3638-
3639-
36403647
def _prep_ndarray(values, copy=True):
36413648
if not isinstance(values, np.ndarray):
36423649
arr = np.asarray(values)

pandas/core/groupby.py

+5-4
Original file line number | Diff line number | Diff line change
@@ -453,7 +453,8 @@ def _wrap_frames(self, keys, values, not_indexed_same=False):
453453
self.groupings,
454454
axis=self.axis)
455455
else:
456-
result = concat(values, axis=0).reindex(self.obj.index)
456+
result = concat(values, axis=0, verify_integrity=False)
457+
result = result.reindex(self.obj.index)
457458

458459
return result
459460

@@ -1117,7 +1118,7 @@ def transform(self, func, *args, **kwargs):
11171118
>>> grouped = df.groupby(lambda x: mapping[x])
11181119
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
11191120
"""
1120-
import pandas.tools.merge as merge
1121+
from pandas.tools.merge import concat
11211122

11221123
applied = []
11231124

@@ -1143,8 +1144,8 @@ def transform(self, func, *args, **kwargs):
11431144
applied.append(res)
11441145

11451146
concat_index = obj.columns if self.axis == 0 else obj.index
1146-
concatenated = merge.concat(applied, join_index=concat_index,
1147-
axis=self.axis)
1147+
concatenated = concat(applied, join_index=concat_index,
1148+
axis=self.axis, verify_integrity=False)
11481149
return concatenated.reindex_like(obj)
11491150

11501151
class PanelGroupBy(GroupBy):

pandas/tools/merge.py

+24-10
Original file line number | Diff line number | Diff line change
@@ -385,7 +385,7 @@ class _BlockJoinOperation(object):
385385
BlockManager data structures
386386
"""
387387
def __init__(self, data_list, join_index, indexers, axis=1, copy=True):
388-
if axis <= 0:
388+
if axis <= 0: # pragma: no cover
389389
raise Exception('Only axis >= 1 supported for this operation')
390390

391391
assert(len(data_list) == len(indexers))
@@ -587,49 +587,55 @@ def concat(frames, axis=0, join='outer', join_index=None,
587587
How to handle indexes on other axis
588588
join_index : index-like
589589
verify_integrity : boolean, default False
590+
Check whether the new concatenated axis contains duplicates. This can
591+
be very expensive relative to the actual data concatenation
590592
591593
Returns
592594
-------
593595
concatenated : DataFrame
594596
"""
595597
op = Concatenator(frames, axis=axis, join_index=join_index,
596-
ignore_index=ignore_index,
598+
ignore_index=ignore_index, join=join,
597599
verify_integrity=verify_integrity)
598600
return op.get_result()
599601

600602

601603
class Concatenator(object):
602604
"""
603-
605+
Orchestrates a concatenation operation with a list of DataFrame objects
604606
"""
605607

606608
def __init__(self, frames, axis=0, join='outer', join_index=None,
607609
ignore_index=False, verify_integrity=False):
610+
if join == 'outer':
611+
self.intersect = False
612+
elif join == 'inner':
613+
self.intersect = True
614+
else: # pragma: no cover
615+
raise ValueError('Only can inner (intersect) or outer (union) join '
616+
'the other axis')
608617

609618
# consolidate data
610619
for frame in frames:
611620
frame.consolidate(inplace=True)
612621

613622
self.frames = frames
614623
self.axis = axis
615-
self.join = join
616624
self.join_index = join_index
617625

618626
self.ignore_index = ignore_index
619-
620627
self.verify_integrity = verify_integrity
621-
622628
self.new_index, self.new_columns = self._get_new_axes()
623629

624630
def get_result(self):
625631
if len(self.frames) == 1:
626632
return self.frames[0]
627633

628634
new_data = self._get_concatenated_data()
629-
new_index, new_columns = self._get_new_axes()
630635
constructor = self._get_frame_constructor()
631636

632-
return constructor(new_data, index=new_index, columns=new_columns)
637+
return constructor(new_data, index=self.new_index,
638+
columns=self.new_columns)
633639

634640
def _get_concatenated_data(self):
635641
try:
@@ -717,9 +723,13 @@ def _get_new_axes(self):
717723

718724
if self.join_index is None:
719725
all_cols = [df.columns for df in self.frames]
720-
new_columns = _get_combined_index(all_cols, intersect=False)
726+
new_columns = _get_combined_index(all_cols,
727+
intersect=self.intersect)
721728
else:
722729
new_columns = self.join_index
730+
731+
self.frames = [df.reindex(columns=new_columns, copy=False)
732+
for df in self.frames]
723733
else:
724734
new_columns = _concat_indexes([df.columns for df in self.frames])
725735
self._maybe_check_integrity(new_columns)
@@ -730,10 +740,14 @@ def _get_new_axes(self):
730740

731741
if self.join_index is None:
732742
all_indexes = [df.index for df in self.frames]
733-
new_index = _get_combined_index(all_indexes, intersect=False)
743+
new_index = _get_combined_index(all_indexes,
744+
intersect=self.intersect)
734745
else:
735746
new_index = self.join_index
736747

748+
self.frames = [df.reindex(new_index, copy=False)
749+
for df in self.frames]
750+
737751
return new_index, new_columns
738752

739753
def _get_frame_constructor(self):

pandas/tools/tests/test_merge.py

+41
Original file line number | Diff line number | Diff line change
@@ -678,6 +678,47 @@ def test_append_different_columns(self):
678678
self.assert_(isnull(appended['strings'][:5]).all())
679679
self.assert_(isnull(appended['bools'][5:]).all())
680680

681+
def test_append_many(self):
682+
chunks = [self.frame[:5], self.frame[5:10],
683+
self.frame[10:15], self.frame[15:]]
684+
685+
result = chunks[0].append(chunks[1:])
686+
tm.assert_frame_equal(result, self.frame)
687+
688+
chunks[-1]['foo'] = 'bar'
689+
result = chunks[0].append(chunks[1:])
690+
tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame)
691+
self.assert_((result['foo'][15:] == 'bar').all())
692+
self.assert_(result['foo'][:15].isnull().all())
693+
694+
def test_join_many(self):
695+
df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
696+
df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]
697+
698+
joined = df_list[0].join(df_list[1:])
699+
tm.assert_frame_equal(joined, df)
700+
701+
df_list = [df[['a', 'b']][:-2],
702+
df[['c', 'd']][2:], df[['e', 'f']][1:9]]
703+
704+
def _check_diff_index(df_list, result, exp_index):
705+
reindexed = [x.reindex(exp_index) for x in df_list]
706+
expected = reindexed[0].join(reindexed[1:])
707+
tm.assert_frame_equal(result, expected)
708+
709+
710+
# different join types
711+
joined = df_list[0].join(df_list[1:], how='outer')
712+
_check_diff_index(df_list, joined, df.index)
713+
714+
joined = df_list[0].join(df_list[1:])
715+
_check_diff_index(df_list, joined, df_list[0].index)
716+
717+
joined = df_list[0].join(df_list[1:], how='inner')
718+
_check_diff_index(df_list, joined, df.index[2:8])
719+
720+
self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a')
721+
681722
def test_append_missing_column_proper_upcast(self):
682723
pass
683724

0 commit comments

Comments
 (0)