diff --git a/doc/source/release.rst b/doc/source/release.rst
index 90fbb620ea5fd..5fde8ed577f94 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -180,6 +180,7 @@ Bug Fixes
   - Bug in ``nanops.var`` with ``ddof=1`` and 1 elements would sometimes return
     ``inf`` rather than ``nan`` on some platforms (:issue:`6136`)
   - Bug in Series and DataFrame bar plots ignoring the ``use_index`` keyword (:issue:`6209`)
+  - Bug in groupby with mixed str/int under python3 fixed; ``argsort`` was failing (:issue:`6212`)
 
 pandas 0.13.0
 -------------
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 729d4e4059595..d82846bd8cafd 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -10,6 +10,7 @@ import pandas.algos as algos
 import pandas.hashtable as htable
 import pandas.compat as compat
 
+from pandas.compat import filter, string_types
 
 def match(to_match, values, na_sentinel=-1):
     """
@@ -32,7 +33,7 @@ def match(to_match, values, na_sentinel=-1):
     match : ndarray of integers
     """
     values = com._asarray_tuplesafe(values)
-    if issubclass(values.dtype.type, compat.string_types):
+    if issubclass(values.dtype.type, string_types):
         values = np.array(values, dtype='O')
 
     f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
@@ -143,7 +144,24 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
     uniques = uniques.to_array()
 
     if sort and len(uniques) > 0:
-        sorter = uniques.argsort()
+        try:
+            sorter = uniques.argsort()
+        except TypeError:
+            # Python 3 refuses to order str against int, so argsort raises
+            # TypeError on mixed-type uniques.  Only that case is handled
+            # here; any other failure must propagate to the caller.
+            t = hash_klass(len(uniques))
+            t.map_locations(com._ensure_object(uniques))
+
+            # order non-strings (e.g. ints) before strings
+            non_strs = [e for e in uniques if not isinstance(e, string_types)]
+            strs = [e for e in uniques if isinstance(e, string_types)]
+            ordered = np.concatenate([
+                np.sort(np.array(non_strs, dtype=object)),
+                np.sort(np.array(strs, dtype=object)),
+            ])
+            sorter = t.lookup(com._ensure_object(ordered))
+
         reverse_indexer = np.empty(len(sorter), dtype=np.int_)
         reverse_indexer.put(sorter, np.arange(len(sorter)))
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 3b53e8737d166..6a80c9f053c71 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -2106,6 +2106,28 @@ def test_apply_with_mixed_dtype(self):
         result2 = df.groupby("c2", as_index=False).mean().c2
         assert_series_equal(result1,result2)
 
+    def test_groupby_aggregation_mixed_dtype(self):
+
+        # GH 6212
+        expected = DataFrame({
+            'v1': [5,5,7,np.nan,3,3,4,1],
+            'v2': [55,55,77,np.nan,33,33,44,11]},
+            index=MultiIndex.from_tuples([(1,95),(1,99),(2,95),(2,99),('big','damp'),
+                                          ('blue','dry'),('red','red'),('red','wet')],
+                                         names=['by1','by2']))
+
+        df = DataFrame({
+            'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9],
+            'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99],
+            'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
+            'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan,
+                    np.nan]
+            })
+
+        g = df.groupby(['by1','by2'])
+        result = g[['v1','v2']].mean()
+        assert_frame_equal(result,expected)
+
     def test_groupby_list_infer_array_like(self):
         result = self.df.groupby(list(self.df['A'])).mean()
         expected = self.df.groupby(self.df['A']).mean()