Skip to content

Commit 2842ad1

Browse files
committed
BUG: fix memory error in sortlevel with high-cardinality MultiIndex levels. close #2684
1 parent d738b64 commit 2842ad1

File tree

3 files changed

+18
-1
lines changed

3 files changed

+18
-1
lines changed

RELEASE.rst

+3
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ pandas 0.10.1
104104
- Raise a more helpful error message in merge operations when one DataFrame
105105
has duplicate columns (GH2649_)
106106
- Fix partial date parsing issue occurring only when code is run at EOM (GH2618_)
107+
- Prevent MemoryError when using counting sort in sortlevel with
108+
high-cardinality MultiIndex objects (GH2684_)
107109

108110
**API Changes**
109111

@@ -133,6 +135,7 @@ pandas 0.10.1
133135
.. _GH2643: https://github.com/pydata/pandas/issues/2643
134136
.. _GH2649: https://github.com/pydata/pandas/issues/2649
135137
.. _GH2668: https://github.com/pydata/pandas/issues/2668
138+
.. _GH2684: https://github.com/pydata/pandas/issues/2684
136139
.. _GH2689: https://github.com/pydata/pandas/issues/2689
137140
.. _GH2690: https://github.com/pydata/pandas/issues/2690
138141
.. _GH2692: https://github.com/pydata/pandas/issues/2692

pandas/core/groupby.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -2244,7 +2244,12 @@ def _indexer_from_factorized(labels, shape, compress=True):
22442244
comp_ids = group_index
22452245
max_group = np.prod(shape)
22462246

2247-
indexer, _ = _algos.groupsort_indexer(comp_ids.astype(np.int64), max_group)
2247+
if max_group > 1e6:
2248+
# Use mergesort to avoid memory errors in counting sort
2249+
indexer = comp_ids.argsort(kind='mergesort')
2250+
else:
2251+
indexer, _ = _algos.groupsort_indexer(comp_ids.astype(np.int64),
2252+
max_group)
22482253

22492254
return indexer
22502255

pandas/tests/test_multilevel.py

+9
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,15 @@ def test_sortlevel(self):
612612
rs.sortlevel(0, inplace=True)
613613
assert_frame_equal(rs, self.frame.sortlevel(0))
614614

615+
def test_sortlevel_large_cardinality(self):
616+
# #2684
617+
index = MultiIndex.from_arrays([np.arange(4000)]*3)
618+
df = DataFrame(np.random.randn(4000), index=index)
619+
620+
# it works!
621+
result = df.sortlevel(0)
622+
self.assertTrue(result.index.lexsort_depth == 3)
623+
615624
def test_delevel_infer_dtype(self):
616625
tuples = [tuple for tuple in cart_product(['foo', 'bar'],
617626
[10, 20], [1.0, 1.1])]

0 commit comments

Comments
 (0)