Merge pull request #6222 from jreback/groupby_py3

jreback · jreback · commit f1ebaa31153d · 2014-02-01T10:12:17.000-08:00
BUG/TST: groupby with mixed string/int grouper failing in python3 (GH6212)
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -180,6 +180,7 @@ Bug Fixes
   - Bug in ``nanops.var`` with ``ddof=1`` and 1 elements would sometimes return ``inf``
     rather than ``nan`` on some platforms (:issue:`6136`)
   - Bug in Series and DataFrame bar plots ignoring the ``use_index`` keyword (:issue:`6209`)
+  - Bug in groupby with mixed str/int keys failing under Python 3 because ``argsort`` cannot order them (:issue:`6212`)
 
 pandas 0.13.0
 -------------
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -10,6 +10,7 @@
 import pandas.algos as algos
 import pandas.hashtable as htable
 import pandas.compat as compat
+from pandas.compat import filter, string_types
 
 def match(to_match, values, na_sentinel=-1):
     """
@@ -32,7 +33,7 @@ def match(to_match, values, na_sentinel=-1):
     match : ndarray of integers
     """
     values = com._asarray_tuplesafe(values)
-    if issubclass(values.dtype.type, compat.string_types):
+    if issubclass(values.dtype.type, string_types):
         values = np.array(values, dtype='O')
 
     f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
@@ -143,7 +144,20 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
     uniques = uniques.to_array()
 
     if sort and len(uniques) > 0:
-        sorter = uniques.argsort()
+        try:
+            sorter = uniques.argsort()
+        except:
+            # argsort fails in py3 when uniques mix str and int (unorderable)
+            t = hash_klass(len(uniques))
+            t.map_locations(com._ensure_object(uniques))
+
+            # order ints before strings
+            ordered = np.concatenate([
+                np.sort(np.array([ e for i, e in enumerate(uniques) if f(e) ],dtype=object)) for f in [ lambda x: not isinstance(x,string_types),
+                                                                                                        lambda x: isinstance(x,string_types) ]
+                ])
+            sorter = t.lookup(com._ensure_object(ordered))
+
         reverse_indexer = np.empty(len(sorter), dtype=np.int_)
         reverse_indexer.put(sorter, np.arange(len(sorter)))
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -2106,6 +2106,28 @@ def test_apply_with_mixed_dtype(self):
         result2 = df.groupby("c2", as_index=False).mean().c2
         assert_series_equal(result1,result2)
 
+    def test_groupby_aggregation_mixed_dtype(self):
+
+        # GH 6212: groupby on mixed str/int keys; ints sort before strings
+        expected = DataFrame({
+            'v1': [5,5,7,np.nan,3,3,4,1],
+            'v2': [55,55,77,np.nan,33,33,44,11]},
+            index=MultiIndex.from_tuples([(1,95),(1,99),(2,95),(2,99),('big','damp'),
+                                          ('blue','dry'),('red','red'),('red','wet')],
+                                         names=['by1','by2']))
+
+        df = DataFrame({
+            'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9],
+            'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99],
+            'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
+            'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan,
+                    np.nan]
+            })
+
+        g = df.groupby(['by1','by2'])
+        result = g[['v1','v2']].mean()
+        assert_frame_equal(result,expected)
+
     def test_groupby_list_infer_array_like(self):
         result = self.df.groupby(list(self.df['A'])).mean()
         expected = self.df.groupby(self.df['A']).mean()