Skip to content

BUG: Bug in groupby with first/last where dtypes could change (GH3041_) #3044

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 14, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,9 @@ pandas 0.11.0
values (see GH2922_, GH2892_), also check for out-of-bounds indices (GH3029_)
- Bug in DataFrame column insertion when the column creation fails, existing frame is left in
an irrecoverable state (GH3010_)
- Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_)
- Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_)
- Bug in DataFrame update, combine_first where non-specified values could cause
dtype changes (GH3016_, GH3041_)
- Bug in groupby with first/last where dtypes could change (GH3041_)
- Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from
other values) (GH2850_)
- Unstack of a frame with no nans would always cause dtype upcasting (GH2929_)
Expand Down
24 changes: 23 additions & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,26 @@ def _possibly_cast_item(obj, item, dtype):
raise ValueError("Unexpected dtype encountered: %s" % dtype)


def _possibly_downcast_to_dtype(result, dtype):
    """ try to cast result back to the specified dtype (e.g. convert back
        to bool/int, or an astype to float64); the input is returned
        unchanged when no cast is attempted or the cast would alter a value

        Parameters
        ----------
        result : candidate value; only np.ndarray instances are ever cast
        dtype : the dtype to cast back to

        Returns
        -------
        the cast ndarray, or ``result`` itself when no cast was done
    """

    if not isinstance(result, np.ndarray):
        return result

    try:
        # np.float64 is the type formerly aliased as np.float_ (the alias
        # was removed in numpy 2.0)
        if dtype == np.float64:
            return result.astype(dtype)
        elif dtype == np.bool_ or dtype == np.int_:
            # only numeric data without any nulls is a candidate, and the
            # cast is kept only if every value survives the round trip
            if (issubclass(result.dtype.type, np.number) and
                    notnull(result).all()):
                new_result = result.astype(dtype)
                if (new_result == result).all():
                    return new_result
    except Exception:
        # deliberately best-effort: any failure while casting keeps the
        # original result; unlike a bare except this does not swallow
        # KeyboardInterrupt / SystemExit
        pass

    return result

def _interp_wrapper(f, wrap_dtype, na_override=None):
def wrapper(arr, mask, limit=None):
view = arr.view(wrap_dtype)
Expand Down Expand Up @@ -936,7 +956,9 @@ def _possibly_convert_platform(values):
return values

def _possibly_cast_to_timedelta(value, coerce=True):
""" try to cast to timedelta64 w/o coercion """
""" try to cast to timedelta64, if already a timedeltalike, then make
sure that we are [ns] (as numpy 1.6.2 is very buggy in this regard);
don't force the conversion unless coerce is True """

# deal with numpy not being able to handle certain timedelta operations
if isinstance(value,np.ndarray) and value.dtype.kind == 'm':
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1594,6 +1594,10 @@ def _cython_agg_blocks(self, how, numeric_only=True):
values = com.ensure_float(values)

result, _ = self.grouper.aggregate(values, how, axis=agg_axis)

# see if we can cast the block back to the original dtype
result = block._try_cast_result(result)

newb = make_block(result, block.items, block.ref_items)
new_blocks.append(newb)

Expand Down
21 changes: 7 additions & 14 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from numpy import nan
import numpy as np

from pandas.core.common import _possibly_downcast_to_dtype
from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes
from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices
import pandas.core.common as com
Expand Down Expand Up @@ -560,6 +561,9 @@ class NumericBlock(Block):
is_numeric = True
_can_hold_na = True

def _try_cast_result(self, result):
    """ try to cast the result back to this block's dtype """
    target_dtype = self.dtype
    return _possibly_downcast_to_dtype(result, target_dtype)

class FloatBlock(NumericBlock):

def _can_hold_element(self, element):
Expand Down Expand Up @@ -608,20 +612,6 @@ def _try_cast(self, element):
except: # pragma: no cover
return element

def _try_cast_result(self, result):
    """ try to cast a floating ndarray result back to this block's dtype;
        the result is returned unchanged when it is not a floating ndarray,
        contains nulls, or would not survive the cast exactly """

    # this is quite restrictive to convert: only floating ndarrays are
    # candidates at all
    if not (isinstance(result, np.ndarray) and
            issubclass(result.dtype.type, np.floating)):
        return result

    try:
        if com.notnull(result).all():
            new_result = result.astype(self.dtype)
            # keep the cast only if every value round-trips unchanged
            if (new_result == result).all():
                return new_result
    except Exception:
        # deliberately best-effort: a failed cast keeps the original
        # result; unlike a bare except this does not swallow
        # KeyboardInterrupt / SystemExit
        pass

    return result

def should_store(self, value):
    """ store value only if it is an integer dtype that matches this
        block's dtype """
    is_integer = com.is_integer_dtype(value)
    return is_integer and value.dtype == self.dtype

Expand All @@ -639,6 +629,9 @@ def _try_cast(self, element):
except: # pragma: no cover
return element

def _try_cast_result(self, result):
    """ try to cast the result back to this block's dtype """
    block_dtype = self.dtype
    return _possibly_downcast_to_dtype(result, block_dtype)

def should_store(self, value):
    """ store value only if its dtype is a boolean type """
    value_type = value.dtype.type
    return issubclass(value_type, np.bool_)

Expand Down
14 changes: 9 additions & 5 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,21 +163,25 @@ def test_first_last_nth(self):
self.assert_(com.isnull(grouped['B'].nth(0)['foo']))

def test_first_last_nth_dtypes(self):
# tests for first / last / nth

grouped = self.df_mixed_floats.groupby('A')
df = self.df_mixed_floats.copy()
df['E'] = True
df['F'] = 1

# tests for first / last / nth
grouped = df.groupby('A')
first = grouped.first()
expected = self.df_mixed_floats.ix[[1, 0], ['B', 'C', 'D']]
expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']]
expected.index = ['bar', 'foo']
assert_frame_equal(first, expected, check_names=False)

last = grouped.last()
expected = self.df_mixed_floats.ix[[5, 7], ['B', 'C', 'D']]
expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']]
expected.index = ['bar', 'foo']
assert_frame_equal(last, expected, check_names=False)

nth = grouped.nth(1)
expected = self.df_mixed_floats.ix[[3, 2], ['B', 'C', 'D']]
expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
expected.index = ['bar', 'foo']
assert_frame_equal(nth, expected, check_names=False)

Expand Down