Skip to content

Commit ea0edfe

Browse files
committed
BUG: handle int64 overflows (suboptimally...) in groupby, sortlevel, sort_index, etc. GH #851
1 parent ba8172b commit ea0edfe

File tree

5 files changed

+193
-106
lines changed

5 files changed

+193
-106
lines changed

pandas/core/algorithms.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,6 @@ def match(values, index):
3030
return _match_generic(values, index, lib.PyObjectHashTable,
3131
com._ensure_object)
3232

33-
def _get_hash_table_and_cast(values):
34-
if com.is_float_dtype(values):
35-
klass = lib.Float64HashTable
36-
values = com._ensure_float64(values)
37-
elif com.is_integer_dtype(values):
38-
klass = lib.Int64HashTable
39-
values = com._ensure_int64(values)
40-
else:
41-
klass = lib.PyObjectHashTable
42-
values = com._ensure_object(values)
43-
return klass, values
44-
4533
def count(values, uniques=None):
4634
if uniques is not None:
4735
raise NotImplementedError
@@ -104,6 +92,20 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
10492

10593
return labels, uniques, counts
10694

95+
96+
def _get_hash_table_and_cast(values):
97+
if com.is_float_dtype(values):
98+
klass = lib.Float64HashTable
99+
values = com._ensure_float64(values)
100+
elif com.is_integer_dtype(values):
101+
klass = lib.Int64HashTable
102+
values = com._ensure_int64(values)
103+
else:
104+
klass = lib.PyObjectHashTable
105+
values = com._ensure_object(values)
106+
return klass, values
107+
108+
107109
def unique(values):
108110
"""
109111

pandas/core/frame.py

Lines changed: 2 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2270,6 +2270,8 @@ def sort_index(self, axis=0, by=None, ascending=True):
22702270
-------
22712271
sorted : DataFrame
22722272
"""
2273+
from pandas.core.groupby import _lexsort_indexer
2274+
22732275
labels = self._get_axis(axis)
22742276

22752277
if by is not None:
@@ -4293,37 +4295,6 @@ def complete_dataframe(obj, prev_completions):
42934295
pass
42944296

42954297

4296-
def _indexer_from_factorized(labels, shape, compress=True):
4297-
from pandas.core.groupby import get_group_index, _compress_group_index
4298-
4299-
group_index = get_group_index(labels, shape)
4300-
4301-
if compress:
4302-
comp_ids, obs_ids = _compress_group_index(group_index)
4303-
max_group = len(obs_ids)
4304-
else:
4305-
comp_ids = group_index
4306-
max_group = np.prod(shape)
4307-
4308-
indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
4309-
4310-
return indexer
4311-
4312-
4313-
def _lexsort_indexer(keys):
4314-
labels = []
4315-
shape = []
4316-
for key in keys:
4317-
rizer = lib.Factorizer(len(key))
4318-
4319-
if not key.dtype == np.object_:
4320-
key = key.astype('O')
4321-
4322-
ids, _ = rizer.factorize(key, sort=True)
4323-
labels.append(ids)
4324-
shape.append(len(rizer.uniques))
4325-
return _indexer_from_factorized(labels, shape)
4326-
43274298

43284299
if __name__ == '__main__':
43294300
import nose

pandas/core/groupby.py

Lines changed: 105 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from itertools import izip
22
import types
3-
43
import numpy as np
54

65
from pandas.core.frame import DataFrame
@@ -413,9 +412,10 @@ class Grouper(object):
413412
"""
414413
415414
"""
416-
def __init__(self, axis, groupings):
415+
def __init__(self, axis, groupings, sort=True):
417416
self.axis = axis
418417
self.groupings = groupings
418+
self.sort = sort
419419

420420
@property
421421
def shape(self):
@@ -507,19 +507,56 @@ def groups(self):
507507

508508
@cache_readonly
509509
def group_info(self):
510-
if len(self.groupings) > 1:
511-
all_labels = [ping.labels for ping in self.groupings]
512-
group_index = get_group_index(all_labels, self.shape)
513-
comp_ids, obs_group_ids = _compress_group_index(group_index)
514-
else:
515-
ping = self.groupings[0]
516-
group_index = ping.labels
510+
comp_ids, obs_group_ids = self._get_compressed_labels()
517511

518-
comp_ids, obs_group_ids = _compress_group_index(group_index)
519512
ngroups = len(obs_group_ids)
520513
comp_ids = com._ensure_int32(comp_ids)
521514
return comp_ids, obs_group_ids, ngroups
522515

516+
def _get_compressed_labels(self):
517+
all_labels = [ping.labels for ping in self.groupings]
518+
if self._overflow_possible:
519+
tups = lib.fast_zip(all_labels)
520+
labs, uniques, _ = algos.factorize(tups)
521+
522+
if self.sort:
523+
uniques, labs = _reorder_by_uniques(uniques, labs)
524+
525+
return labs, uniques
526+
else:
527+
if len(all_labels) > 1:
528+
group_index = get_group_index(all_labels, self.shape)
529+
else:
530+
group_index = all_labels[0]
531+
comp_ids, obs_group_ids = _compress_group_index(group_index)
532+
return comp_ids, obs_group_ids
533+
534+
@cache_readonly
535+
def _overflow_possible(self):
536+
return _int64_overflow_possible(self.shape)
537+
538+
@cache_readonly
539+
def result_index(self):
540+
recons = self.get_group_levels()
541+
return MultiIndex.from_arrays(recons, names=self.names)
542+
543+
def get_group_levels(self):
544+
obs_ids = self.group_info[1]
545+
if self._overflow_possible:
546+
recons_labels = [np.array(x) for x in izip(*obs_ids)]
547+
else:
548+
recons_labels = decons_group_index(obs_ids, self.shape)
549+
550+
name_list = []
551+
for ping, labels in zip(self.groupings, recons_labels):
552+
labels = com._ensure_platform_int(labels)
553+
name_list.append(ping.group_index.take(labels))
554+
555+
return name_list
556+
557+
#------------------------------------------------------------
558+
# Aggregation functions
559+
523560
_cython_functions = {
524561
'add' : lib.group_add,
525562
'mean' : lib.group_mean,
@@ -603,22 +640,6 @@ def _aggregate_series_pure_python(self, obj, func):
603640
result = lib.maybe_convert_objects(result, try_float=0)
604641
return result, counts
605642

606-
@cache_readonly
607-
def result_index(self):
608-
recons = self.get_group_levels()
609-
return MultiIndex.from_arrays(recons, names=self.names)
610-
611-
def get_group_levels(self):
612-
obs_ids = self.group_info[1]
613-
recons_labels = decons_group_index(obs_ids, self.shape)
614-
615-
name_list = []
616-
for ping, labels in zip(self.groupings, recons_labels):
617-
labels = com._ensure_platform_int(labels)
618-
name_list.append(ping.group_index.take(labels))
619-
620-
return name_list
621-
622643

623644
class Grouping(object):
624645
"""
@@ -793,7 +814,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
793814
ping.name = 'key_%d' % i
794815
groupings.append(ping)
795816

796-
grouper = Grouper(group_axis, groupings)
817+
grouper = Grouper(group_axis, groupings, sort=sort)
797818

798819
return grouper, exclusions
799820

@@ -1470,24 +1491,21 @@ def get_group_index(label_list, shape):
14701491
n = len(label_list[0])
14711492
group_index = np.zeros(n, dtype=np.int64)
14721493
mask = np.zeros(n, dtype=bool)
1473-
1474-
if _int64_overflow_possible(shape):
1475-
raise Exception('Possible int64 overflow, raise exception for now')
1476-
else:
1477-
for i in xrange(len(shape)):
1478-
stride = np.prod([x for x in shape[i+1:]], dtype=np.int64)
1479-
group_index += com._ensure_int64(label_list[i]) * stride
1480-
mask |= label_list[i] < 0
1494+
for i in xrange(len(shape)):
1495+
stride = np.prod([x for x in shape[i+1:]], dtype=np.int64)
1496+
group_index += com._ensure_int64(label_list[i]) * stride
1497+
mask |= label_list[i] < 0
14811498

14821499
np.putmask(group_index, mask, -1)
14831500
return group_index
14841501

1502+
_INT64_MAX = np.iinfo(np.int64).max
14851503
def _int64_overflow_possible(shape):
14861504
the_prod = 1L
14871505
for x in shape:
14881506
the_prod *= long(x)
14891507

1490-
return the_prod >= 2**63
1508+
return the_prod >= _INT64_MAX
14911509

14921510
def decons_group_index(comp_labels, shape):
14931511
# reconstruct labels
@@ -1504,6 +1522,39 @@ def decons_group_index(comp_labels, shape):
15041522
return label_list[::-1]
15051523

15061524

1525+
def _indexer_from_factorized(labels, shape, compress=True):
1526+
if _int64_overflow_possible(shape):
1527+
indexer = np.lexsort(np.array(labels[::-1]))
1528+
return indexer
1529+
1530+
group_index = get_group_index(labels, shape)
1531+
1532+
if compress:
1533+
comp_ids, obs_ids = _compress_group_index(group_index)
1534+
max_group = len(obs_ids)
1535+
else:
1536+
comp_ids = group_index
1537+
max_group = np.prod(shape)
1538+
1539+
indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
1540+
1541+
return indexer
1542+
1543+
1544+
def _lexsort_indexer(keys):
1545+
labels = []
1546+
shape = []
1547+
for key in keys:
1548+
rizer = lib.Factorizer(len(key))
1549+
1550+
if not key.dtype == np.object_:
1551+
key = key.astype('O')
1552+
1553+
ids, _ = rizer.factorize(key, sort=True)
1554+
labels.append(ids)
1555+
shape.append(len(rizer.uniques))
1556+
return _indexer_from_factorized(labels, shape)
1557+
15071558
class _KeyMapper(object):
15081559
"""
15091560
Ease my suffering. Map compressed group id -> key tuple
@@ -1548,23 +1599,29 @@ def _compress_group_index(group_index, sort=True):
15481599
obs_group_ids = np.array(uniques, dtype='i8')
15491600

15501601
if sort and len(obs_group_ids) > 0:
1551-
# sorter is index where elements ought to go
1552-
sorter = obs_group_ids.argsort()
1602+
obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids,
1603+
comp_ids)
15531604

1554-
# reverse_indexer is where elements came from
1555-
reverse_indexer = np.empty(len(sorter), dtype='i4')
1556-
reverse_indexer.put(sorter, np.arange(len(sorter)))
1605+
return comp_ids, obs_group_ids
15571606

1558-
mask = comp_ids < 0
1607+
def _reorder_by_uniques(uniques, labels):
1608+
# sorter is index where elements ought to go
1609+
sorter = uniques.argsort()
15591610

1560-
# move comp_ids to right locations (ie, unsort ascending labels)
1561-
comp_ids = reverse_indexer.take(comp_ids)
1562-
np.putmask(comp_ids, mask, -1)
1611+
# reverse_indexer is where elements came from
1612+
reverse_indexer = np.empty(len(sorter), dtype='i4')
1613+
reverse_indexer.put(sorter, np.arange(len(sorter)))
15631614

1564-
# sort observed ids
1565-
obs_group_ids = obs_group_ids.take(sorter)
1615+
mask = labels < 0
15661616

1567-
return comp_ids, obs_group_ids
1617+
# move labels to right locations (ie, unsort ascending labels)
1618+
labels = reverse_indexer.take(labels)
1619+
np.putmask(labels, mask, -1)
1620+
1621+
# sort observed ids
1622+
uniques = uniques.take(sorter)
1623+
1624+
return uniques, labels
15681625

15691626
def _groupby_indices(values):
15701627
if values.dtype != np.object_:

pandas/core/index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1578,7 +1578,7 @@ def sortlevel(self, level=0, ascending=True):
15781578
-------
15791579
sorted_index : MultiIndex
15801580
"""
1581-
from pandas.core.frame import _indexer_from_factorized
1581+
from pandas.core.groupby import _indexer_from_factorized
15821582

15831583
labels = list(self.labels)
15841584

0 commit comments

Comments
 (0)