Skip to content

Commit c375fa3

Browse files
committed
ENH: implement inner join with the `on` argument in DataFrame.join, GH #248
1 parent ca5a702 commit c375fa3

File tree

4 files changed

+178
-148
lines changed

4 files changed

+178
-148
lines changed

pandas/core/frame.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2179,14 +2179,15 @@ def _get_raw_column(self, col):
21792179
def join(self, other, on=None, how=None, lsuffix='', rsuffix=''):
21802180
"""
21812181
Join columns with other DataFrame either on index or on a key
2182-
column
2182+
column.
21832183
21842184
Parameters
21852185
----------
21862186
other : DataFrame
21872187
Index should be similar to one of the columns in this one
21882188
on : string, default None
2189-
Column name to use, otherwise join on index
2189+
Column name to use, otherwise join on index. Just like an Excel
2190+
VLOOKUP operation
21902191
how : {'left', 'right', 'outer', 'inner'}
21912192
How to handle indexes of the two objects. Default: 'left'
21922193
for joining on index, None otherwise
@@ -2203,18 +2204,17 @@ def join(self, other, on=None, how=None, lsuffix='', rsuffix=''):
22032204
-------
22042205
joined : DataFrame
22052206
"""
2207+
if how is None:
2208+
how = 'left'
22062209
if on is not None:
2207-
if how is not None:
2208-
raise Exception('how parameter is not valid when '
2209-
'*on* specified')
2210-
return self._join_on(other, on, lsuffix, rsuffix)
2210+
return self._join_on(other, on, how, lsuffix, rsuffix)
22112211
else:
2212-
if how is None:
2213-
how = 'left'
2214-
22152212
return self._join_index(other, how, lsuffix, rsuffix)
22162213

2217-
def _join_on(self, other, on, lsuffix, rsuffix):
2214+
def _join_on(self, other, on, how, lsuffix, rsuffix):
2215+
if how not in ['left', 'inner']:
2216+
raise Exception('Only inner / left joins currently supported')
2217+
22182218
if isinstance(other, Series):
22192219
assert(other.name is not None)
22202220
other = DataFrame({other.name : other})
@@ -2232,7 +2232,7 @@ def _join_on(self, other, on, lsuffix, rsuffix):
22322232
else:
22332233
join_key = self[on].values
22342234

2235-
new_data = self._data.join_on(other._data, join_key, axis=1,
2235+
new_data = self._data.join_on(other._data, join_key, how=how, axis=1,
22362236
lsuffix=lsuffix, rsuffix=rsuffix)
22372237
return self._constructor(new_data)
22382238

pandas/core/internals.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -729,15 +729,24 @@ def _is_indexed_like(self, other):
729729
return False
730730
return True
731731

732-
def join_on(self, other, on, axis=1, lsuffix=None, rsuffix=None):
732+
def join_on(self, other, on, how='left', axis=1, lsuffix=None,
733+
rsuffix=None):
733734
this, other = self._maybe_rename_join(other, lsuffix, rsuffix)
734735

735736
other_axis = other.axes[axis]
736737
indexer = other_axis.get_indexer(on)
737738

738739
# TODO: deal with length-0 case? or does it fall out?
739-
mask = indexer == -1
740-
needs_masking = len(on) > 0 and mask.any()
740+
if how == 'left':
741+
mask = indexer == -1
742+
needs_masking = len(on) > 0 and mask.any()
743+
else:
744+
mask = indexer != -1
745+
this = this.take(mask.nonzero()[0], axis=axis)
746+
indexer = indexer[mask]
747+
mask = None
748+
needs_masking = False
749+
741750
other_blocks = []
742751
for block in other.blocks:
743752
newb = block.reindex_axis(indexer, mask, needs_masking, axis=axis)

pandas/core/sparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1104,7 +1104,7 @@ def add_suffix(self, suffix):
11041104
f = ('%s' + ('%s' % suffix)).__mod__
11051105
return self.rename(columns=f)
11061106

1107-
def _join_on(self, other, on):
1107+
def _join_on(self, other, on, how, lsuffix, rsuffix):
11081108
# need to implement?
11091109
raise NotImplementedError
11101110

pandas/tests/test_frame.py

Lines changed: 154 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -2520,139 +2520,6 @@ def test_combineMult(self):
25202520
comb = self.empty.combineMult(self.frame)
25212521
assert_frame_equal(comb, self.frame)
25222522

2523-
def test_join_on(self):
2524-
index, data = tm.getMixedTypeDict()
2525-
target = DataFrame(data, index=index)
2526-
2527-
# Join on string value
2528-
source = DataFrame({'MergedA' : data['A'], 'MergedD' : data['D']},
2529-
index=data['C'])
2530-
merged = target.join(source, on='C')
2531-
self.assert_(np.array_equal(merged['MergedA'], target['A']))
2532-
self.assert_(np.array_equal(merged['MergedD'], target['D']))
2533-
2534-
# join with duplicates (fix regression from DataFrame/Matrix merge)
2535-
df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']})
2536-
df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c'])
2537-
joined = df.join(df2, on='key')
2538-
expected = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c'],
2539-
'value' : [0, 0, 1, 1, 2]})
2540-
assert_frame_equal(joined, expected)
2541-
2542-
# Test when some are missing
2543-
df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
2544-
columns=['one'])
2545-
df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
2546-
columns=['two'])
2547-
df_c = DataFrame([[1], [2]], index=[1, 2],
2548-
columns=['three'])
2549-
joined = df_a.join(df_b, on='one')
2550-
joined = joined.join(df_c, on='one')
2551-
self.assert_(np.isnan(joined['two']['c']))
2552-
self.assert_(np.isnan(joined['three']['c']))
2553-
2554-
# merge column not present
2555-
self.assertRaises(Exception, target.join, source, on='E')
2556-
2557-
# nothing to merge
2558-
merged = target.join(source.reindex([]), on='C')
2559-
2560-
# overlap
2561-
source_copy = source.copy()
2562-
source_copy['A'] = 0
2563-
self.assertRaises(Exception, target.join, source_copy, on='A')
2564-
2565-
# can't specify how
2566-
self.assertRaises(Exception, target.join, source, on='C',
2567-
how='left')
2568-
2569-
def test_join_on_singlekey_list(self):
2570-
df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']})
2571-
df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c'])
2572-
2573-
# corner cases
2574-
joined = df.join(df2, on=['key'])
2575-
expected = df.join(df2, on='key')
2576-
2577-
assert_frame_equal(joined, expected)
2578-
2579-
def test_join_on_multikey(self):
2580-
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
2581-
['one', 'two', 'three']],
2582-
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
2583-
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
2584-
names=['first', 'second'])
2585-
to_join = DataFrame(np.random.randn(10, 3), index=index,
2586-
columns=['j_one', 'j_two', 'j_three'])
2587-
2588-
# a little relevant example with NAs
2589-
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
2590-
'qux', 'snap']
2591-
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
2592-
'three', 'one']
2593-
2594-
data = np.random.randn(len(key1))
2595-
data = DataFrame({'key1' : key1, 'key2' : key2,
2596-
'data' : data})
2597-
2598-
joined = data.join(to_join, on=['key1', 'key2'])
2599-
2600-
join_key = Index(zip(key1, key2))
2601-
indexer = to_join.index.get_indexer(join_key)
2602-
ex_values = to_join.values.take(indexer, axis=0)
2603-
ex_values[indexer == -1] = np.nan
2604-
expected = data.join(DataFrame(ex_values, columns=to_join.columns))
2605-
2606-
# TODO: columns aren't in the same order yet
2607-
assert_frame_equal(joined, expected.ix[:, joined.columns])
2608-
2609-
def test_join_index_mixed(self):
2610-
2611-
df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True},
2612-
index=np.arange(10),
2613-
columns=['A', 'B', 'C', 'D'])
2614-
self.assert_(df1['B'].dtype == np.int_)
2615-
self.assert_(df1['D'].dtype == np.bool_)
2616-
2617-
df2 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True},
2618-
index=np.arange(0, 10, 2),
2619-
columns=['A', 'B', 'C', 'D'])
2620-
2621-
# overlap
2622-
joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
2623-
expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
2624-
'A_two', 'B_two', 'C_two', 'D_two']
2625-
df1.columns = expected_columns[:4]
2626-
df2.columns = expected_columns[4:]
2627-
expected = _join_by_hand(df1, df2)
2628-
assert_frame_equal(joined, expected)
2629-
2630-
# no overlapping blocks
2631-
df1 = DataFrame(index=np.arange(10))
2632-
df1['bool'] = True
2633-
df1['string'] = 'foo'
2634-
2635-
df2 = DataFrame(index=np.arange(5, 15))
2636-
df2['int'] = 1
2637-
df2['float'] = 1.
2638-
2639-
for kind in JOIN_TYPES:
2640-
joined = df1.join(df2, how=kind)
2641-
expected = _join_by_hand(df1, df2, how=kind)
2642-
assert_frame_equal(joined, expected)
2643-
2644-
joined = df2.join(df1, how=kind)
2645-
expected = _join_by_hand(df2, df1, how=kind)
2646-
assert_frame_equal(joined, expected)
2647-
2648-
def test_join_on_series(self):
2649-
pass
2650-
2651-
def test_join_empty_bug(self):
2652-
# generated an exception in 0.4.3
2653-
x = DataFrame()
2654-
x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
2655-
26562523
def test_clip(self):
26572524
median = self.frame.median().median()
26582525

@@ -3184,6 +3051,160 @@ def test_series_put_names(self):
31843051
for k, v in series.iteritems():
31853052
self.assertEqual(v.name, k)
31863053

3054+
3055+
3056+
class TestDataFrameJoin(unittest.TestCase):
3057+
3058+
def setUp(self):
3059+
index, data = tm.getMixedTypeDict()
3060+
self.target = DataFrame(data, index=index)
3061+
3062+
# Join on string value
3063+
self.source = DataFrame({'MergedA' : data['A'], 'MergedD' : data['D']},
3064+
index=data['C'])
3065+
3066+
def test_join_on(self):
3067+
target = self.target
3068+
source = self.source
3069+
3070+
merged = target.join(source, on='C')
3071+
self.assert_(np.array_equal(merged['MergedA'], target['A']))
3072+
self.assert_(np.array_equal(merged['MergedD'], target['D']))
3073+
3074+
# join with duplicates (fix regression from DataFrame/Matrix merge)
3075+
df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']})
3076+
df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c'])
3077+
joined = df.join(df2, on='key')
3078+
expected = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c'],
3079+
'value' : [0, 0, 1, 1, 2]})
3080+
assert_frame_equal(joined, expected)
3081+
3082+
# Test when some are missing
3083+
df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
3084+
columns=['one'])
3085+
df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
3086+
columns=['two'])
3087+
df_c = DataFrame([[1], [2]], index=[1, 2],
3088+
columns=['three'])
3089+
joined = df_a.join(df_b, on='one')
3090+
joined = joined.join(df_c, on='one')
3091+
self.assert_(np.isnan(joined['two']['c']))
3092+
self.assert_(np.isnan(joined['three']['c']))
3093+
3094+
# merge column not present
3095+
self.assertRaises(Exception, target.join, source, on='E')
3096+
3097+
# overlap
3098+
source_copy = source.copy()
3099+
source_copy['A'] = 0
3100+
self.assertRaises(Exception, target.join, source_copy, on='A')
3101+
3102+
def test_join_with_len0(self):
3103+
# nothing to merge
3104+
merged = self.target.join(self.source.reindex([]), on='C')
3105+
for col in self.source:
3106+
self.assert_(col in merged)
3107+
self.assert_(merged[col].isnull().all())
3108+
3109+
def test_join_on_inner(self):
3110+
df = DataFrame({'key' : ['a', 'a', 'd', 'b', 'b', 'c']})
3111+
df2 = DataFrame({'value' : [0, 1]}, index=['a', 'b'])
3112+
3113+
joined = df.join(df2, on='key', how='inner')
3114+
3115+
expected = df.join(df2, on='key')
3116+
expected = expected[expected['value'].notnull()]
3117+
self.assert_(np.array_equal(joined['key'], expected['key']))
3118+
self.assert_(np.array_equal(joined['value'], expected['value']))
3119+
self.assert_(joined.index.equals(expected.index))
3120+
3121+
def test_join_on_singlekey_list(self):
3122+
df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']})
3123+
df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c'])
3124+
3125+
# corner cases
3126+
joined = df.join(df2, on=['key'])
3127+
expected = df.join(df2, on='key')
3128+
3129+
assert_frame_equal(joined, expected)
3130+
3131+
def test_join_on_multikey(self):
3132+
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
3133+
['one', 'two', 'three']],
3134+
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
3135+
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
3136+
names=['first', 'second'])
3137+
to_join = DataFrame(np.random.randn(10, 3), index=index,
3138+
columns=['j_one', 'j_two', 'j_three'])
3139+
3140+
# a little relevant example with NAs
3141+
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
3142+
'qux', 'snap']
3143+
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
3144+
'three', 'one']
3145+
3146+
data = np.random.randn(len(key1))
3147+
data = DataFrame({'key1' : key1, 'key2' : key2,
3148+
'data' : data})
3149+
3150+
joined = data.join(to_join, on=['key1', 'key2'])
3151+
3152+
join_key = Index(zip(key1, key2))
3153+
indexer = to_join.index.get_indexer(join_key)
3154+
ex_values = to_join.values.take(indexer, axis=0)
3155+
ex_values[indexer == -1] = np.nan
3156+
expected = data.join(DataFrame(ex_values, columns=to_join.columns))
3157+
3158+
# TODO: columns aren't in the same order yet
3159+
assert_frame_equal(joined, expected.ix[:, joined.columns])
3160+
3161+
def test_join_index_mixed(self):
3162+
3163+
df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True},
3164+
index=np.arange(10),
3165+
columns=['A', 'B', 'C', 'D'])
3166+
self.assert_(df1['B'].dtype == np.int_)
3167+
self.assert_(df1['D'].dtype == np.bool_)
3168+
3169+
df2 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True},
3170+
index=np.arange(0, 10, 2),
3171+
columns=['A', 'B', 'C', 'D'])
3172+
3173+
# overlap
3174+
joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
3175+
expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
3176+
'A_two', 'B_two', 'C_two', 'D_two']
3177+
df1.columns = expected_columns[:4]
3178+
df2.columns = expected_columns[4:]
3179+
expected = _join_by_hand(df1, df2)
3180+
assert_frame_equal(joined, expected)
3181+
3182+
# no overlapping blocks
3183+
df1 = DataFrame(index=np.arange(10))
3184+
df1['bool'] = True
3185+
df1['string'] = 'foo'
3186+
3187+
df2 = DataFrame(index=np.arange(5, 15))
3188+
df2['int'] = 1
3189+
df2['float'] = 1.
3190+
3191+
for kind in JOIN_TYPES:
3192+
joined = df1.join(df2, how=kind)
3193+
expected = _join_by_hand(df1, df2, how=kind)
3194+
assert_frame_equal(joined, expected)
3195+
3196+
joined = df2.join(df1, how=kind)
3197+
expected = _join_by_hand(df2, df1, how=kind)
3198+
assert_frame_equal(joined, expected)
3199+
3200+
def test_join_on_series(self):
3201+
pass
3202+
3203+
def test_join_empty_bug(self):
3204+
# generated an exception in 0.4.3
3205+
x = DataFrame()
3206+
x.join(DataFrame([3], index=[0], columns=['A']), how='outer')
3207+
31873208
def _join_by_hand(a, b, how='left'):
31883209
join_index = a.index.join(b.index, how=how)
31893210

0 commit comments

Comments
 (0)