Skip to content

Commit 07f3914

Browse files
committed
ENH: first cuts on many-to-many joining, #249, #267
1 parent c2ce803 commit 07f3914

File tree

11 files changed

+936
-565
lines changed

11 files changed

+936
-565
lines changed

TODO.rst

+4 -7
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,7 @@
1-
LongPanel removal
2-
=================
3-
4-
- DONE level to flex methods
5-
- DONE level to reindex
6-
- ?? fast take for items
7-
1+
Join methods todo
2+
-----------------
3+
- Joint factorizer
4+
- NA group handling
85

96
DONE
107
----

bench/bench_merge.py

+36
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,36 @@
1+
from pandas import *
2+
import random
3+
4+
N = 10000
5+
ngroups = 3
6+
7+
def get_test_data(ngroups=100, n=N):
8+
unique_groups = range(ngroups)
9+
arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
10+
11+
if len(arr) < n:
12+
arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
13+
dtype=object)
14+
15+
random.shuffle(arr)
16+
return arr
17+
18+
# aggregate multiple columns
19+
df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
20+
'key2' : get_test_data(ngroups=ngroups),
21+
'data1' : np.random.randn(N),
22+
'data2' : np.random.randn(N)})
23+
24+
df2 = DataFrame({'key1' : [0, 1, 2, 0, 1, 2],
25+
'key2' : [0, 1, 2, 0, 1, 2],
26+
'value' : list('abcdef')})
27+
28+
29+
import pandas.tools.merge as merge
30+
reload(merge)
31+
32+
left, right = merge._get_group_keys([df['key1'], df['key2']],
33+
[df2['key1'], df2['key2']])
34+
35+
left, right = merge._get_group_keys([df['key1']], [df2['key1']])
36+

pandas/core/index.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -1790,7 +1790,8 @@ def insert(self, loc, item):
17901790
new_levels.append(level)
17911791
new_labels.append(np.insert(labels, loc, lev_loc))
17921792

1793-
return MultiIndex(levels=new_levels, labels=new_labels, names=self.names)
1793+
return MultiIndex(levels=new_levels, labels=new_labels,
1794+
names=self.names)
17941795

17951796
def delete(self, loc):
17961797
"""

pandas/core/internals.py

+11 -23
Original file line number | Diff line number | Diff line change
@@ -1081,15 +1081,21 @@ def _union_items_slow(all_items):
10811081
return seen
10821082

10831083
def join_managers(left, right, axis=1, how='left', copy=True):
1084-
op = _JoinOperation(left, right, axis=axis, how=how)
1084+
join_index, left_indexer, right_indexer = \
1085+
left.axes[axis].join(right.axes[axis], how=how, return_indexers=True)
1086+
op = _JoinOperation(left, right, join_index, left_indexer,
1087+
right_indexer, axis=axis)
10851088
return op.get_result(copy=copy)
10861089

10871090
class _JoinOperation(object):
10881091
"""
10891092
Object responsible for orchestrating efficient join operation between two
10901093
BlockManager data structures
10911094
"""
1092-
def __init__(self, left, right, axis=1, how='left'):
1095+
def __init__(self, left, right, join_index, left_indexer, right_indexer,
1096+
axis=1):
1097+
assert(axis > 0)
1098+
10931099
if not left.is_consolidated():
10941100
left = left.consolidate()
10951101
if not right.is_consolidated():
@@ -1098,14 +1104,10 @@ def __init__(self, left, right, axis=1, how='left'):
10981104
self.left = left
10991105
self.right = right
11001106
self.axis = axis
1101-
self.how = how
1102-
1103-
laxis = left.axes[axis]
1104-
raxis = right.axes[axis]
11051107

1106-
(self.join_index,
1107-
self.lindexer,
1108-
self.rindexer) = laxis.join(raxis, how=how, return_indexers=True)
1108+
self.join_index = join_index
1109+
self.lindexer = left_indexer
1110+
self.rindexer = right_indexer
11091111

11101112
# do NOT sort
11111113
self.result_items = left.items.append(right.items)
@@ -1284,17 +1286,3 @@ def _upcast_blocks(blocks):
12841286

12851287
# use any ref_items
12861288
return _consolidate(new_blocks, newb.ref_items)
1287-
1288-
def _make_block_indexers(blocks, indexer, block_ids, block_locs, block_dtypes,
1289-
ref_items):
1290-
counts = defaultdict(int)
1291-
for dtype_name in block_dtypes.take(indexer):
1292-
counts[dtype_name] += 1
1293-
1294-
findexer = np.empty(counts['float64'], dtype='i4')
1295-
bindexer = np.empty(counts['bool'], dtype='i4')
1296-
oindexer = np.empty(counts['object'], dtype='i4')
1297-
iindexer = np.empty(counts['int64'], dtype='i4')
1298-
1299-
for idx in indexer:
1300-
pass

0 commit comments

Comments
 (0)