From dae37eddf09e9ae3b62d98b0e82fd63244a9fbd2 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 28 Dec 2013 22:57:13 -0500
Subject: [PATCH 1/4] BUG: don't always coerce reductions in a groupby to
 datetimes; only coerce when we have actual Timestamps in the data
 (GH5788, GH5789)

---
 pandas/core/groupby.py       |  8 ++++++--
 pandas/tests/test_groupby.py | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 182f75e53ca5d..e188270306771 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2268,8 +2268,12 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                                  columns=columns).convert_objects(convert_dates=cd, convert_numeric=True)
             else:
-                return Series(values, index=key_index).convert_objects(
-                    convert_dates='coerce',convert_numeric=True)
+                # only coerce dates if we find at least 1 datetime
+                cd = False
+                if any([ isinstance(v,Timestamp) for v in values ]):
+                    cd = 'coerce'
+                return Series(values, index=key_index).convert_objects(convert_dates=cd)
+
         else:
             # Handle cases like BinGrouper
             return self._concat_objects(keys, values,
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 7e54aa4e0813f..9d7e90e5f8f32 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -28,7 +28,7 @@
 import pandas.core.nanops as nanops
 import pandas.util.testing as tm
 
-
+import pandas as pd
 
 def commonSetUp(self):
     self.dateRange = bdate_range('1/1/2005', periods=250)
@@ -481,6 +481,36 @@ def test_apply_describe_bug(self):
         grouped = self.mframe.groupby(level='first')
         result = grouped.describe()  # it works!
 
+    def test_apply_issues(self):
+        # GH 5788
+
+        s="""2011.05.16,00:00,1.40893
+2011.05.16,01:00,1.40760
+2011.05.16,02:00,1.40750
+2011.05.16,03:00,1.40649
+2011.05.17,02:00,1.40893
+2011.05.17,03:00,1.40760
+2011.05.17,04:00,1.40750
+2011.05.17,05:00,1.40649
+2011.05.18,02:00,1.40893
+2011.05.18,03:00,1.40760
+2011.05.18,04:00,1.40750
+2011.05.18,05:00,1.40649"""
+
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], parse_dates=[['date', 'time']])
+        df = df.set_index('date_time')
+
+        expected = df.groupby(df.index.date).idxmax()
+        result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
+        assert_frame_equal(result,expected)
+
+        # GH 5789
+        # don't auto coerce dates
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'])
+        expected = Series(['00:00','02:00','02:00'],index=['2011.05.16','2011.05.17','2011.05.18'])
+        result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
+        assert_series_equal(result,expected)
+
     def test_len(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,
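
A quick illustration of the guard patch 1 introduces, applied to the GH5789
shape of data. This is a minimal sketch assuming the 0.13-era API, where
Series.convert_objects still exists (it was deprecated in later pandas);
the sample values are illustrative:

    from pandas import Series, Timestamp

    # what the GH5789 apply produces: plain strings, no Timestamps
    values = ['00:00', '02:00', '02:00']
    cd = 'coerce' if any(isinstance(v, Timestamp) for v in values) else False
    # cd is False here, so the strings survive; the old unconditional
    # convert_dates='coerce' could parse them into datetimes instead
    result = Series(values).convert_objects(convert_dates=cd)

    # with actual Timestamps in the values, coercion still kicks in
    values = [Timestamp('2011-05-16'), Timestamp('2011-05-17')]
    cd = 'coerce' if any(isinstance(v, Timestamp) for v in values) else False
    result = Series(values, dtype=object).convert_objects(convert_dates=cd)
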
From 2a3dc240581a2b65001addf071c4a5f09ef79370 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 28 Dec 2013 23:57:40 -0500
Subject: [PATCH 2/4] INT: allow internal errors in block construction to
 bubble up

---
 pandas/core/internals.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 4e657ca343c12..e76cf69eb420b 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -3556,12 +3556,14 @@ def _consolidate_inplace(self):
         pass
 
 
-def construction_error(tot_items, block_shape, axes):
+def construction_error(tot_items, block_shape, axes, e=None):
     """ raise a helpful message about our construction """
-    raise ValueError("Shape of passed values is %s, indices imply %s" % (
-        tuple(map(int, [tot_items] + list(block_shape))),
-        tuple(map(int, [len(ax) for ax in axes]))))
-
+    passed = tuple(map(int, [tot_items] + list(block_shape)))
+    implied = tuple(map(int, [len(ax) for ax in axes]))
+    if passed == implied and e is not None:
+        raise e
+    raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
+        passed,implied))
 
 def create_block_manager_from_blocks(blocks, axes):
     try:
@@ -3576,10 +3578,10 @@ def create_block_manager_from_blocks(blocks, axes):
         mgr._consolidate_inplace()
         return mgr
 
-    except (ValueError):
+    except (ValueError) as e:
         blocks = [getattr(b, 'values', b) for b in blocks]
         tot_items = sum(b.shape[0] for b in blocks)
-        construction_error(tot_items, blocks[0].shape[1:], axes)
+        construction_error(tot_items, blocks[0].shape[1:], axes, e)
 
 
 def create_block_manager_from_arrays(arrays, names, axes):
@@ -3588,8 +3590,8 @@ def create_block_manager_from_arrays(arrays, names, axes):
         mgr = BlockManager(blocks, axes)
         mgr._consolidate_inplace()
         return mgr
-    except (ValueError):
-        construction_error(len(arrays), arrays[0].shape[1:], axes)
+    except (ValueError) as e:
+        construction_error(len(arrays), arrays[0].shape[1:], axes, e)
 
 
 def maybe_create_block_in_items_map(im, block):
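
A self-contained sketch of the bubbling behaviour patch 2 adds. The helper
body mirrors the hunk above; the toy call at the end is hypothetical, with
shapes chosen to agree so that the original error surfaces instead of a
misleading shape message:

    def construction_error(tot_items, block_shape, axes, e=None):
        passed = tuple(map(int, [tot_items] + list(block_shape)))
        implied = tuple(map(int, [len(ax) for ax in axes]))
        if passed == implied and e is not None:
            # shapes agree, so the caught ValueError was a genuine internal
            # failure rather than a user shape mismatch: re-raise unchanged
            raise e
        raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
            passed, implied))

    # passed (2, 3) == implied (2, 3) -> "some internal error" bubbles up
    try:
        construction_error(2, (3,), [range(2), range(3)],
                           ValueError("some internal error"))
    except ValueError as e:
        assert str(e) == "some internal error"
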
From dc1ca7bb9e83266f763bf2c723dbeaf460db690e Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 29 Dec 2013 00:03:48 -0500
Subject: [PATCH 3/4] BUG: resolve GH5788 under numpy < 1.7, where vstack is
 odd with M8[ns]

---
 pandas/core/groupby.py | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index e188270306771..a7fbb3ccf807f 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -22,6 +22,7 @@
                                notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                is_timedelta64_dtype, is_datetime64_dtype)
 
+from pandas import _np_version_under1p7
 import pandas.lib as lib
 from pandas.lib import Timestamp
 import pandas.algos as _algos
@@ -2243,16 +2244,19 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                 try:
                     if self.axis == 0:
-                        stacked_values = np.vstack([np.asarray(x)
-                                                    for x in values])
-                        columns = v.index
-                        index = key_index
+                        # normally use vstack as it's faster than concat
+                        # and if we have mi-columns
+                        if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
+                            stacked_values = np.vstack([np.asarray(x) for x in values])
+                            result = DataFrame(stacked_values,index=key_index,columns=v.index)
+                        else:
+                            # GH5788 instead of stacking; concat gets the dtypes correct
+                            from pandas.tools.merge import concat
+                            result = concat(values,keys=key_index,names=key_index.names,
+                                            axis=self.axis).unstack()
                     else:
-                        stacked_values = np.vstack([np.asarray(x)
-                                                    for x in values]).T
-
-                        index = v.index
-                        columns = key_index
+                        stacked_values = np.vstack([np.asarray(x) for x in values])
+                        result = DataFrame(stacked_values.T,index=v.index,columns=key_index)
 
                 except (ValueError, AttributeError):
                     # GH1738: values is list of arrays of unequal lengths fall
@@ -2261,17 +2265,12 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                 # if we have date/time like in the original, then coerce dates
                 # as we are stacking can easily have object dtypes here
-                cd = True
-                if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any():
-                    cd = 'coerce'
-                return DataFrame(stacked_values, index=index,
-                                 columns=columns).convert_objects(convert_dates=cd, convert_numeric=True)
+                cd = 'coerce' if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any() else True
+                return result.convert_objects(convert_dates=cd, convert_numeric=True)
 
             else:
                 # only coerce dates if we find at least 1 datetime
-                cd = False
-                if any([ isinstance(v,Timestamp) for v in values ]):
-                    cd = 'coerce'
+                cd = 'coerce' if any([ isinstance(v,Timestamp) for v in values ]) else False
                 return Series(values, index=key_index).convert_objects(convert_dates=cd)
 
         else:

From e375550b138af1318bc5c4d51c99658c67d554ed Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 29 Dec 2013 10:16:38 -0500
Subject: [PATCH 4/4] BUG: remove convert_numeric=True from groupbys; perform
 soft-conversion of numeric dtypes instead

---
 pandas/core/common.py  | 23 ++++++++++++++---------
 pandas/core/groupby.py |  2 +-
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/pandas/core/common.py b/pandas/core/common.py
index 08061b1d14863..a9b56b6905b6b 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -1527,17 +1527,22 @@ def _possibly_convert_objects(values, convert_dates=True,
             values, convert_datetime=convert_dates)
 
     # convert to numeric
-    if convert_numeric and values.dtype == np.object_:
-        try:
-            new_values = lib.maybe_convert_numeric(
-                values, set(), coerce_numeric=True)
+    if values.dtype == np.object_:
+        if convert_numeric:
+            try:
+                new_values = lib.maybe_convert_numeric(
+                    values, set(), coerce_numeric=True)
 
-            # if we are all nans then leave me alone
-            if not isnull(new_values).all():
-                values = new_values
+                # if we are all nans then leave me alone
+                if not isnull(new_values).all():
+                    values = new_values
 
-        except:
-            pass
+            except:
+                pass
+        else:
+
+            # soft-conversion
+            values = lib.maybe_convert_objects(values)
 
     return values
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index a7fbb3ccf807f..fb9b5e7831c88 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2266,7 +2266,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                 # if we have date/time like in the original, then coerce dates
                 # as we are stacking can easily have object dtypes here
                 cd = 'coerce' if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any() else True
-                return result.convert_objects(convert_dates=cd, convert_numeric=True)
+                return result.convert_objects(convert_dates=cd)
 
             else:
                 # only coerce dates if we find at least 1 datetime
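
A quick sketch of the soft-conversion path patch 4 switches groupby results
to, assuming the 0.13-era pandas.lib module (the function moved in later
releases); the sample arrays are illustrative:

    import numpy as np
    import pandas.lib as lib

    # a uniformly numeric object array comes back with a proper numeric dtype
    numeric = lib.maybe_convert_objects(np.array([1, 2, 3], dtype=object))

    # a mixed array stays object -- nothing is forced to numeric, unlike the
    # convert_numeric=True coercion that this patch removes from groupby
    mixed = lib.maybe_convert_objects(np.array(['00:00', 1.5], dtype=object))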