Skip to content

Commit f1ebaa3

Browse files
committed
Merge pull request #6222 from jreback/groupby_py3
BUG/TST: groupby with mixed string/int grouper failing in python3 (GH6212)
2 parents 847bf59 + 761fb23 commit f1ebaa3

File tree

3 files changed

+39
-2
lines changed

3 files changed

+39
-2
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ Bug Fixes
180180
- Bug in ``nanops.var`` with ``ddof=1`` and 1 element would sometimes return ``inf``
181181
rather than ``nan`` on some platforms (:issue:`6136`)
182182
- Bug in Series and DataFrame bar plots ignoring the ``use_index`` keyword (:issue:`6209`)
183+
- Bug in groupby with a mixed str/int grouper under Python 3, where ``argsort`` was failing (:issue:`6212`)
183184

184185
pandas 0.13.0
185186
-------------

pandas/core/algorithms.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import pandas.algos as algos
1111
import pandas.hashtable as htable
1212
import pandas.compat as compat
13+
from pandas.compat import filter, string_types
1314

1415
def match(to_match, values, na_sentinel=-1):
1516
"""
@@ -32,7 +33,7 @@ def match(to_match, values, na_sentinel=-1):
3233
match : ndarray of integers
3334
"""
3435
values = com._asarray_tuplesafe(values)
35-
if issubclass(values.dtype.type, compat.string_types):
36+
if issubclass(values.dtype.type, string_types):
3637
values = np.array(values, dtype='O')
3738

3839
f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
@@ -143,7 +144,20 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
143144
uniques = uniques.to_array()
144145

145146
if sort and len(uniques) > 0:
146-
sorter = uniques.argsort()
147+
try:
148+
sorter = uniques.argsort()
149+
except:
150+
# unorderable in py3 if mixed str/int
151+
t = hash_klass(len(uniques))
152+
t.map_locations(com._ensure_object(uniques))
153+
154+
# order ints before strings
155+
ordered = np.concatenate([
156+
np.sort(np.array([ e for i, e in enumerate(uniques) if f(e) ],dtype=object)) for f in [ lambda x: not isinstance(x,string_types),
157+
lambda x: isinstance(x,string_types) ]
158+
])
159+
sorter = t.lookup(com._ensure_object(ordered))
160+
147161
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
148162
reverse_indexer.put(sorter, np.arange(len(sorter)))
149163

pandas/tests/test_groupby.py

+22
Original file line numberDiff line numberDiff line change
@@ -2106,6 +2106,28 @@ def test_apply_with_mixed_dtype(self):
21062106
result2 = df.groupby("c2", as_index=False).mean().c2
21072107
assert_series_equal(result1,result2)
21082108

2109+
def test_groupby_aggregation_mixed_dtype(self):
2110+
2111+
# GH 6212
2112+
expected = DataFrame({
2113+
'v1': [5,5,7,np.nan,3,3,4,1],
2114+
'v2': [55,55,77,np.nan,33,33,44,11]},
2115+
index=MultiIndex.from_tuples([(1,95),(1,99),(2,95),(2,99),('big','damp'),
2116+
('blue','dry'),('red','red'),('red','wet')],
2117+
names=['by1','by2']))
2118+
2119+
df = DataFrame({
2120+
'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9],
2121+
'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99],
2122+
'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
2123+
'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan,
2124+
np.nan]
2125+
})
2126+
2127+
g = df.groupby(['by1','by2'])
2128+
result = g[['v1','v2']].mean()
2129+
assert_frame_equal(result,expected)
2130+
21092131
def test_groupby_list_infer_array_like(self):
21102132
result = self.df.groupby(list(self.df['A'])).mean()
21112133
expected = self.df.groupby(self.df['A']).mean()

0 commit comments

Comments
 (0)