diff --git a/RELEASE.rst b/RELEASE.rst index eb2d30ca3e448..009bcb8c5d5d1 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -198,7 +198,7 @@ pandas 0.11.0 an irrecoverable state (GH3010_) - Bug in DataFrame update, combine_first where non-specified values could cause dtype changes (GH3016_, GH3041_) - - Bug in groupby with first/last where dtypes could change (GH3041_) + - Bug in groupby with first/last where dtypes could change (GH3041_, GH2763_) - Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from other values), (GH2850_) - Unstack of a frame with no nans would always cause dtype upcasting (GH2929_) @@ -251,6 +251,7 @@ pandas 0.11.0 .. _GH2746: https://github.com/pydata/pandas/issues/2746 .. _GH2747: https://github.com/pydata/pandas/issues/2747 .. _GH2751: https://github.com/pydata/pandas/issues/2751 +.. _GH2763: https://github.com/pydata/pandas/issues/2763 .. _GH2776: https://github.com/pydata/pandas/issues/2776 .. _GH2778: https://github.com/pydata/pandas/issues/2778 .. _GH2787: https://github.com/pydata/pandas/issues/2787 diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0eb64834fe1aa..053deaa550b06 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -13,6 +13,7 @@ from pandas.util.compat import OrderedDict import pandas.core.algorithms as algos import pandas.core.common as com +from pandas.core.common import _possibly_downcast_to_dtype import pandas.lib as lib import pandas.algos as _algos @@ -440,14 +441,7 @@ def _try_cast(self, result, obj): # need to respect a non-number here (e.g. 
Decimal) if len(result) and issubclass(type(result[0]),(np.number,float,int)): - if issubclass(dtype.type, (np.integer, np.bool_)): - - # castable back to an int/bool as we don't have nans - if com.notnull(result).all(): - result = result.astype(dtype) - else: - - result = result.astype(dtype) + result = _possibly_downcast_to_dtype(result, dtype) elif issubclass(dtype.type, np.datetime64): if is_datetime64_dtype(obj.dtype): @@ -468,7 +462,7 @@ def _cython_agg_general(self, how, numeric_only=True): result, names = self.grouper.aggregate(obj.values, how) except AssertionError as e: raise GroupByError(str(e)) - output[name] = result + output[name] = self._try_cast(result, obj) if len(output) == 0: raise DataError('No numeric types to aggregate') diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index d609080b833bd..9e623de5483ab 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -91,48 +91,51 @@ def setUp(self): 'F': np.random.randn(11)}) def test_basic(self): - data = Series(np.arange(9) // 3, index=np.arange(9)) - index = np.arange(9) - np.random.shuffle(index) - data = data.reindex(index) + def checkit(dtype): + data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) - grouped = data.groupby(lambda x: x // 3) + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) - for k, v in grouped: - self.assertEqual(len(v), 3) + grouped = data.groupby(lambda x: x // 3) - agged = grouped.aggregate(np.mean) - self.assertEqual(agged[1], 1) + for k, v in grouped: + self.assertEqual(len(v), 3) - assert_series_equal(agged, grouped.agg(np.mean)) # shorthand - assert_series_equal(agged, grouped.mean()) + agged = grouped.aggregate(np.mean) + self.assertEqual(agged[1], 1) - # Cython only returning floating point for now... 
- assert_series_equal(grouped.agg(np.sum).astype(float), - grouped.sum()) + assert_series_equal(agged, grouped.agg(np.mean)) # shorthand + assert_series_equal(agged, grouped.mean()) + assert_series_equal(grouped.agg(np.sum),grouped.sum()) - transformed = grouped.transform(lambda x: x * x.sum()) - self.assertEqual(transformed[7], 12) + transformed = grouped.transform(lambda x: x * x.sum()) + self.assertEqual(transformed[7], 12) - value_grouped = data.groupby(data) - assert_series_equal(value_grouped.aggregate(np.mean), agged) + value_grouped = data.groupby(data) + assert_series_equal(value_grouped.aggregate(np.mean), agged) - # complex agg - agged = grouped.aggregate([np.mean, np.std]) - agged = grouped.aggregate({'one': np.mean, - 'two': np.std}) + # complex agg + agged = grouped.aggregate([np.mean, np.std]) + agged = grouped.aggregate({'one': np.mean, + 'two': np.std}) + + group_constants = { + 0: 10, + 1: 20, + 2: 30 + } + agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) + self.assertEqual(agged[1], 21) - group_constants = { - 0: 10, - 1: 20, - 2: 30 - } - agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) - self.assertEqual(agged[1], 21) + # corner cases + self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2) - # corner cases - self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2) + + for dtype in ['int64','int32','float64','float32']: + checkit(dtype) def test_first_last_nth(self): # tests for first / last / nth @@ -185,6 +188,14 @@ def test_first_last_nth_dtypes(self): expected.index = ['bar', 'foo'] assert_frame_equal(nth, expected, check_names=False) + # GH 2763, first/last shifting dtypes + idx = range(10) + idx.append(9) + s = Series(data=range(11), index=idx, name='IntCol') + self.assert_(s.dtype == 'int64') + f = s.groupby(level=0).first() + self.assert_(f.dtype == 'int64') + def test_grouper_iter(self): self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo']) diff --git 
a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index c7d4f50649c34..f1594b154f2cc 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -71,9 +71,9 @@ def test_custom_grouper(self): idx = idx.append(dti[-1:]) expect = Series(arr, index=idx) - # cython returns float for now + # GH2763 - return input dtype if we can result = g.agg(np.sum) - assert_series_equal(result, expect.astype(float)) + assert_series_equal(result, expect) data = np.random.rand(len(dti), 10) df = DataFrame(data, index=dti)