Skip to content

Commit 465f054

Browse files
committed
Merge pull request #3044 from jreback/groupby_3016
BUG: Bug in groupby with first/last where dtypes could change (GH3041_)
2 parents 4b22372 + 5bc2973 commit 465f054

File tree

5 files changed

+46
-22
lines changed

5 files changed

+46
-22
lines changed

RELEASE.rst

+3-2
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,9 @@ pandas 0.11.0
145145
values (see GH2922_, GH2892_), also check for out-of-bounds indices (GH3029_)
146146
- Bug in DataFrame column insertion when the column creation fails, existing frame is left in
147147
an irrecoverable state (GH3010_)
148-
- Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_)
149-
- Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_)
148+
- Bug in DataFrame update, combine_first where non-specified values could cause
149+
dtype changes (GH3016_, GH3041_)
150+
- Bug in groupby with first/last where dtypes could change (GH3041_)
150151
- Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from
151152
other values), (GH2850_)
152153
- Unstack of a frame with no nans would always cause dtype upcasting (GH2929_)

pandas/core/common.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,26 @@ def _possibly_cast_item(obj, item, dtype):
777777
raise ValueError("Unexpected dtype encountered: %s" % dtype)
778778

779779

780+
def _possibly_downcast_to_dtype(result, dtype):
781+
""" try to cast to the specified dtype (e.g. convert back to bool/int,
782+
or an astype of float64->float32) """
783+
784+
if not isinstance(result, np.ndarray):
785+
return result
786+
787+
try:
788+
if dtype == np.float_:
789+
return result.astype(dtype)
790+
elif dtype == np.bool_ or dtype == np.int_:
791+
if issubclass(result.dtype.type, np.number) and notnull(result).all():
792+
new_result = result.astype(dtype)
793+
if (new_result == result).all():
794+
return new_result
795+
except:
796+
pass
797+
798+
return result
799+
780800
def _interp_wrapper(f, wrap_dtype, na_override=None):
781801
def wrapper(arr, mask, limit=None):
782802
view = arr.view(wrap_dtype)
@@ -936,7 +956,9 @@ def _possibly_convert_platform(values):
936956
return values
937957

938958
def _possibly_cast_to_timedelta(value, coerce=True):
939-
""" try to cast to timedelta64 w/o coercion """
959+
""" try to cast to timedelta64; if already timedelta-like, then make
960+
sure that we are [ns] (numpy 1.6.2 is very buggy in this regard);
961+
don't force the conversion unless coerce is True """
940962

941963
# deal with numpy not being able to handle certain timedelta operations
942964
if isinstance(value,np.ndarray) and value.dtype.kind == 'm':

pandas/core/groupby.py

+4
Original file line numberDiff line numberDiff line change
@@ -1594,6 +1594,10 @@ def _cython_agg_blocks(self, how, numeric_only=True):
15941594
values = com.ensure_float(values)
15951595

15961596
result, _ = self.grouper.aggregate(values, how, axis=agg_axis)
1597+
1598+
# see if we can cast the block back to the original dtype
1599+
result = block._try_cast_result(result)
1600+
15971601
newb = make_block(result, block.items, block.ref_items)
15981602
new_blocks.append(newb)
15991603

pandas/core/internals.py

+7-14
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from numpy import nan
55
import numpy as np
66

7+
from pandas.core.common import _possibly_downcast_to_dtype
78
from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes
89
from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices
910
import pandas.core.common as com
@@ -560,6 +561,9 @@ class NumericBlock(Block):
560561
is_numeric = True
561562
_can_hold_na = True
562563

564+
def _try_cast_result(self, result):
565+
return _possibly_downcast_to_dtype(result, self.dtype)
566+
563567
class FloatBlock(NumericBlock):
564568

565569
def _can_hold_element(self, element):
@@ -608,20 +612,6 @@ def _try_cast(self, element):
608612
except: # pragma: no cover
609613
return element
610614

611-
def _try_cast_result(self, result):
612-
# this is quite restrictive to convert
613-
try:
614-
if (isinstance(result, np.ndarray) and
615-
issubclass(result.dtype.type, np.floating)):
616-
if com.notnull(result).all():
617-
new_result = result.astype(self.dtype)
618-
if (new_result == result).all():
619-
return new_result
620-
except:
621-
pass
622-
623-
return result
624-
625615
def should_store(self, value):
626616
return com.is_integer_dtype(value) and value.dtype == self.dtype
627617

@@ -639,6 +629,9 @@ def _try_cast(self, element):
639629
except: # pragma: no cover
640630
return element
641631

632+
def _try_cast_result(self, result):
633+
return _possibly_downcast_to_dtype(result, self.dtype)
634+
642635
def should_store(self, value):
643636
return issubclass(value.dtype.type, np.bool_)
644637

pandas/tests/test_groupby.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -163,21 +163,25 @@ def test_first_last_nth(self):
163163
self.assert_(com.isnull(grouped['B'].nth(0)['foo']))
164164

165165
def test_first_last_nth_dtypes(self):
166-
# tests for first / last / nth
167166

168-
grouped = self.df_mixed_floats.groupby('A')
167+
df = self.df_mixed_floats.copy()
168+
df['E'] = True
169+
df['F'] = 1
170+
171+
# tests for first / last / nth
172+
grouped = df.groupby('A')
169173
first = grouped.first()
170-
expected = self.df_mixed_floats.ix[[1, 0], ['B', 'C', 'D']]
174+
expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
171175
expected.index = ['bar', 'foo']
172176
assert_frame_equal(first, expected, check_names=False)
173177

174178
last = grouped.last()
175-
expected = self.df_mixed_floats.ix[[5, 7], ['B', 'C', 'D']]
179+
expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
176180
expected.index = ['bar', 'foo']
177181
assert_frame_equal(last, expected, check_names=False)
178182

179183
nth = grouped.nth(1)
180-
expected = self.df_mixed_floats.ix[[3, 2], ['B', 'C', 'D']]
184+
expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
181185
expected.index = ['bar', 'foo']
182186
assert_frame_equal(nth, expected, check_names=False)
183187

0 commit comments

Comments
 (0)