Skip to content

Commit 78d090a

Browse files
committed
BUG: implement 64-bit int overflowing case in merge. close #2690
1 parent 5163bc2 commit 78d090a

File tree

3 files changed

+21
-6
lines changed

3 files changed

+21
-6
lines changed

RELEASE.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ pandas 0.10.1
5555
- Add ``logx`` option to DataFrame/Series.plot (GH2327_, #2565)
5656
- Support reading gzipped data from file-like object
5757
- ``pivot_table`` aggfunc can be anything used in GroupBy.aggregate (GH2643_)
58+
- Implement DataFrame merges in the case where set cardinalities might overflow
59+
a 64-bit integer (GH2690_)
5860

5961
**Bug fixes**
6062

@@ -99,6 +101,7 @@ pandas 0.10.1
99101
.. _GH2625: https://github.com/pydata/pandas/issues/2625
100102
.. _GH2643: https://github.com/pydata/pandas/issues/2643
101103
.. _GH2637: https://github.com/pydata/pandas/issues/2637
104+
.. _GH2690: https://github.com/pydata/pandas/issues/2690
102105
.. _GH2692: https://github.com/pydata/pandas/issues/2692
103106

104107
pandas 0.10.0

pandas/tools/merge.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -425,18 +425,20 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
425425
right_labels.append(rlab)
426426
group_sizes.append(count)
427427

428-
left_group_key = get_group_index(left_labels, group_sizes)
429-
right_group_key = get_group_index(right_labels, group_sizes)
430-
431428
max_groups = 1L
432429
for x in group_sizes:
433430
max_groups *= long(x)
434431

435432
if max_groups > 2 ** 63: # pragma: no cover
436-
raise MergeError('Combinatorial explosion! (boom)')
433+
left_group_key, right_group_key, max_groups = \
434+
_factorize_keys(lib.fast_zip(left_labels),
435+
lib.fast_zip(right_labels))
436+
else:
437+
left_group_key = get_group_index(left_labels, group_sizes)
438+
right_group_key = get_group_index(right_labels, group_sizes)
437439

438-
left_group_key, right_group_key, max_groups = \
439-
_factorize_keys(left_group_key, right_group_key, sort=sort)
440+
left_group_key, right_group_key, max_groups = \
441+
_factorize_keys(left_group_key, right_group_key, sort=sort)
440442

441443
join_func = _join_functions[how]
442444
return join_func(left_group_key, right_group_key, max_groups)

pandas/tools/tests/test_merge.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,16 @@ def test_merge_na_keys(self):
849849

850850
tm.assert_frame_equal(result, expected)
851851

852+
def test_int64_overflow_issues(self):
853+
# GH #2690 (combinatorial explosion): df1 and df2 share the six random-float columns A-F, so the product of the per-key group sizes can exceed the int64 range
854+
df1 = DataFrame(np.random.randn(1000, 7),
855+
columns=list('ABCDEF') + ['G1'])
856+
df2 = DataFrame(np.random.randn(1000, 7),
857+
columns=list('ABCDEF') + ['G2'])
858+
859+
# smoke test: the outer merge must complete without raising; the random keys presumably never match, so all 1000 + 1000 rows are expected in the result
860+
result = merge(df1, df2, how='outer')
861+
self.assertTrue(len(result) == 2000)
852862

853863
def _check_join(left, right, result, join_col, how='left',
854864
lsuffix='_x', rsuffix='_y'):

0 commit comments

Comments
 (0)