Skip to content

Commit ea1186d

Browse files
committed
ENH: support for ordered factors in groupby, close #292
1 parent 4655828 commit ea1186d

File tree

3 files changed

+45
-4
lines changed

3 files changed

+45
-4
lines changed

pandas/core/factor.py

+5-3
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ class Factor(object):
2121
* labels : ndarray
2222
* levels : ndarray
2323
"""
24-
def __init__(self, labels, levels):
24+
def __init__(self, labels, levels, name=None):
2525
from pandas.core.index import _ensure_index
2626

2727
levels = _ensure_index(levels)
@@ -30,6 +30,7 @@ def __init__(self, labels, levels):
3030

3131
self.labels = labels
3232
self.levels = levels
33+
self.name = name
3334

3435
@classmethod
3536
def from_array(cls, data):
@@ -51,9 +52,10 @@ def __len__(self):
5152
return len(self.labels)
5253

5354
def __repr__(self):
    """Return a multi-line text summary of this factor.

    The first line is ``Factor:`` followed directly by ``self.name``
    (nothing is appended when the name is None). The second part is the
    ``repr`` of the factor converted with ``np.asarray``. The last line
    reports the number of levels and the levels themselves.
    """
    materialized = np.asarray(self)
    # An unnamed factor leaves the header as a bare 'Factor:'.
    header_name = self.name if self.name is not None else ''
    pieces = (header_name, repr(materialized), len(self.levels), self.levels)
    return 'Factor:%s\n%s\nLevels (%d): %s' % pieces
5759

5860
def __getitem__(self, key):
5961
if isinstance(key, (int, np.integer)):

pandas/core/groupby.py

+12
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import types
33
import numpy as np
44

5+
from pandas.core.factor import Factor
56
from pandas.core.frame import DataFrame
67
from pandas.core.generic import NDFrame
78
from pandas.core.index import Index, MultiIndex, _ensure_index
@@ -972,6 +973,17 @@ def __init__(self, index, grouper=None, name=None, level=None,
972973
else:
973974
if isinstance(self.grouper, (list, tuple)):
974975
self.grouper = com._asarray_tuplesafe(self.grouper)
976+
elif isinstance(self.grouper, Factor):
977+
factor = self.grouper
978+
self._was_factor = True
979+
980+
# Is there any way to avoid this?
981+
self.grouper = np.asarray(factor)
982+
983+
self._labels = factor.labels
984+
self._group_index = factor.levels
985+
if self.name is None:
986+
self.name = factor.name
975987

976988
# no level passed
977989
if not isinstance(self.grouper, np.ndarray):

pandas/tests/test_groupby.py

+28-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
from pandas import bdate_range
88
from pandas.core.index import Index, MultiIndex
99
from pandas.core.common import rands
10-
from pandas.core.frame import DataFrame
10+
from pandas.core.api import Factor, DataFrame
1111
from pandas.core.groupby import GroupByError
1212
from pandas.core.series import Series
1313
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
@@ -1879,6 +1879,33 @@ def test_no_dummy_key_names(self):
18791879
self.df['B'].values]).sum()
18801880
self.assert_(result.index.names == [None, None])
18811881

1882+
def test_groupby_factor(self):
    """Check that grouping a DataFrame by a Factor works.

    Two things are verified:

    * ``mean`` over the Factor equals ``mean`` over the Factor's array
      values once that result is reindexed to the Factor's levels, and the
      result index takes the Factor's name.
    * ``describe`` over the Factor equals ``describe`` over the same data
      reordered by the Factor's integer labels and grouped with
      ``sort=False``.
    """
    level_names = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)
    factor = Factor(codes, level_names, name='myfactor')

    frame = DataFrame(np.random.randn(100, 4))

    # --- aggregation -----------------------------------------------------
    result = frame.groupby(factor).mean()

    # Grouping by the plain values, then reindexing to the level list,
    # gives the reference result.
    by_values = frame.groupby(np.asarray(factor)).mean()
    expected = by_values.reindex(level_names)

    assert_frame_equal(result, expected)
    self.assert_(result.index.name == factor.name)

    # --- describe --------------------------------------------------------
    desc_result = frame.groupby(factor).describe()

    # Reorder rows by integer label, then group without sorting so the
    # groups appear in that label order.
    order = factor.labels.argsort()
    sorted_values = np.asarray(factor).take(order)
    sorted_frame = frame.take(order)
    expected = sorted_frame.groupby(sorted_values, sort=False).describe()
    assert_frame_equal(desc_result, expected)
1906+
1907+
1908+
18821909
def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
18831910
tups = map(tuple, df[keys].values)
18841911
tups = com._asarray_tuplesafe(tups)

0 commit comments

Comments
 (0)