diff --git a/pandas/core/common.py b/pandas/core/common.py
index 08061b1d14863..a9b56b6905b6b 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -1527,17 +1527,22 @@ def _possibly_convert_objects(values, convert_dates=True,
                 values, convert_datetime=convert_dates)
 
     # convert to numeric
-    if convert_numeric and values.dtype == np.object_:
-        try:
-            new_values = lib.maybe_convert_numeric(
-                values, set(), coerce_numeric=True)
+    if values.dtype == np.object_:
+        if convert_numeric:
+            try:
+                new_values = lib.maybe_convert_numeric(
+                    values, set(), coerce_numeric=True)
 
-            # if we are all nans then leave me alone
-            if not isnull(new_values).all():
-                values = new_values
+                # if we are all nans then leave me alone
+                if not isnull(new_values).all():
+                    values = new_values
 
-        except:
-            pass
+            except:
+                pass
+        else:
+
+            # soft-conversion
+            values = lib.maybe_convert_objects(values)
 
     return values
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 182f75e53ca5d..fb9b5e7831c88 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -22,6 +22,7 @@
                                notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                is_timedelta64_dtype, is_datetime64_dtype)
 
+from pandas import _np_version_under1p7
 import pandas.lib as lib
 from pandas.lib import Timestamp
 import pandas.algos as _algos
@@ -2243,16 +2244,19 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                 try:
                     if self.axis == 0:
-                        stacked_values = np.vstack([np.asarray(x)
-                                                    for x in values])
-                        columns = v.index
-                        index = key_index
+                        # normally use vstack as its faster than concat
+                        # and if we have mi-columns
+                        if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
+                            stacked_values = np.vstack([np.asarray(x) for x in values])
+                            result = DataFrame(stacked_values,index=key_index,columns=v.index)
+                        else:
+                            # GH5788 instead of stacking; concat gets the dtypes correct
+                            from pandas.tools.merge import concat
+                            result = concat(values,keys=key_index,names=key_index.names,
+                                            axis=self.axis).unstack()
                     else:
-                        stacked_values = np.vstack([np.asarray(x)
-                                                    for x in values]).T
-
-                        index = v.index
-                        columns = key_index
+                        stacked_values = np.vstack([np.asarray(x) for x in values])
+                        result = DataFrame(stacked_values.T,index=v.index,columns=key_index)
                 except (ValueError, AttributeError):
                     # GH1738: values is list of arrays of unequal lengths fall
                     # through to the outer else caluse
@@ -2261,15 +2265,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                 # if we have date/time like in the original, then coerce dates
                 # as we are stacking can easily have object dtypes here
-                cd = True
-                if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any():
-                    cd = 'coerce'
-                return DataFrame(stacked_values, index=index,
-                                 columns=columns).convert_objects(convert_dates=cd, convert_numeric=True)
+                cd = 'coerce' if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any() else True
+                return result.convert_objects(convert_dates=cd)
 
             else:
-                return Series(values, index=key_index).convert_objects(
-                    convert_dates='coerce',convert_numeric=True)
+                # only coerce dates if we find at least 1 datetime
+                cd = 'coerce' if any([ isinstance(v,Timestamp) for v in values ]) else False
+                return Series(values, index=key_index).convert_objects(convert_dates=cd)
+
         else:
             # Handle cases like BinGrouper
             return self._concat_objects(keys, values,
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 4e657ca343c12..e76cf69eb420b 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -3556,12 +3556,14 @@ def _consolidate_inplace(self):
     pass
 
 
-def construction_error(tot_items, block_shape, axes):
+def construction_error(tot_items, block_shape, axes, e=None):
     """ raise a helpful message about our construction """
-    raise ValueError("Shape of passed values is %s, indices imply %s" % (
-        tuple(map(int, [tot_items] + list(block_shape))),
-        tuple(map(int, [len(ax) for ax in axes]))))
-
+    passed = tuple(map(int, [tot_items] + list(block_shape)))
+    implied = tuple(map(int, [len(ax) for ax in axes]))
+    if passed == implied and e is not None:
+        raise e
+    raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
+        passed,implied))
 
 def create_block_manager_from_blocks(blocks, axes):
     try:
@@ -3576,10 +3578,10 @@ def create_block_manager_from_blocks(blocks, axes):
         mgr._consolidate_inplace()
         return mgr
 
-    except (ValueError):
+    except (ValueError) as e:
         blocks = [getattr(b, 'values', b) for b in blocks]
         tot_items = sum(b.shape[0] for b in blocks)
-        construction_error(tot_items, blocks[0].shape[1:], axes)
+        construction_error(tot_items, blocks[0].shape[1:], axes, e)
 
 
 def create_block_manager_from_arrays(arrays, names, axes):
@@ -3588,8 +3590,8 @@ def create_block_manager_from_arrays(arrays, names, axes):
         mgr = BlockManager(blocks, axes)
         mgr._consolidate_inplace()
         return mgr
-    except (ValueError):
-        construction_error(len(arrays), arrays[0].shape[1:], axes)
+    except (ValueError) as e:
+        construction_error(len(arrays), arrays[0].shape[1:], axes, e)
 
 
 def maybe_create_block_in_items_map(im, block):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 7e54aa4e0813f..9d7e90e5f8f32 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -28,7 +28,7 @@
 
 import pandas.core.nanops as nanops
 import pandas.util.testing as tm
-
+import pandas as pd
 
 def commonSetUp(self):
     self.dateRange = bdate_range('1/1/2005', periods=250)
@@ -481,6 +481,36 @@ def test_apply_describe_bug(self):
         grouped = self.mframe.groupby(level='first')
         result = grouped.describe()  # it works!
 
+    def test_apply_issues(self):
+        # GH 5788
+
+        s="""2011.05.16,00:00,1.40893
+2011.05.16,01:00,1.40760
+2011.05.16,02:00,1.40750
+2011.05.16,03:00,1.40649
+2011.05.17,02:00,1.40893
+2011.05.17,03:00,1.40760
+2011.05.17,04:00,1.40750
+2011.05.17,05:00,1.40649
+2011.05.18,02:00,1.40893
+2011.05.18,03:00,1.40760
+2011.05.18,04:00,1.40750
+2011.05.18,05:00,1.40649"""
+
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], parse_dates=[['date', 'time']])
+        df = df.set_index('date_time')
+
+        expected = df.groupby(df.index.date).idxmax()
+        result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
+        assert_frame_equal(result,expected)
+
+        # GH 5789
+        # don't auto coerce dates
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'])
+        expected = Series(['00:00','02:00','02:00'],index=['2011.05.16','2011.05.17','2011.05.18'])
+        result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
+        assert_series_equal(result,expected)
+
     def test_len(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,
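Note on the common.py change (illustration, not part of the patch): with convert_numeric=False, object arrays previously passed through _possibly_convert_objects untouched; the new else branch gives them a soft conversion, i.e. dtype inference without forced coercion. A minimal sketch of the same intent using the public API of later pandas versions, where infer_objects() plays the role of the new lib.maybe_convert_objects(values) branch:

    import pandas as pd

    # Soft conversion: infer a better dtype for an object column without
    # coercing unparseable values to NaN.
    s = pd.Series([1, 2, 3], dtype=object)
    print(s.infer_objects().dtype)   # int64 -- inferred, not forced

    s2 = pd.Series(['a', 2], dtype=object)
    print(s2.infer_objects().dtype)  # object -- left alone, nothing is lost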
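Why the groupby.py hunk swaps np.vstack for concat on the GH5788 path (illustration only, plain numpy): stacking the per-group results funnels every column through one homogeneous ndarray, so mixed per-column dtypes collapse to object and have to be re-inferred afterwards, whereas concat(...).unstack() keeps each column's dtype intact. A self-contained demonstration of the dtype loss:

    import numpy as np

    # Each per-group result from apply() becomes one row; with mixed dtypes
    # the row is an object array, and vstack can only produce a single
    # homogeneous 2-D array, so the int64 and datetime64 column dtypes are gone.
    row = np.array([1, np.datetime64('2011-05-16')], dtype=object)
    stacked = np.vstack([row, row])
    print(stacked.dtype)  # object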
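The GH5789 half of the groupby change exists because date parsing is eager: a time-of-day string such as '00:00' parses as a valid Timestamp, so the old unconditional convert_dates='coerce' silently converted string results of apply(). The new guard only coerces when at least one returned value is already a Timestamp; sketched in isolation:

    import pandas as pd

    print(pd.to_datetime('00:00'))  # parses fine: today's date at midnight

    # the patch's guard, stand-alone: plain strings leave cd == False
    values = ['00:00', '02:00', '02:00']
    cd = 'coerce' if any(isinstance(v, pd.Timestamp) for v in values) else False
    print(cd)  # False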
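On the internals.py change: BlockManager construction can raise a ValueError whose cause is not a shape mismatch, and the old handler replaced it with a misleading "Shape of passed values ..." message. The new construction_error compares the passed and implied shapes and, when they actually agree, re-raises the original exception instead. A genuine mismatch still surfaces the helpful message (exact wording may differ across pandas versions):

    import numpy as np
    import pandas as pd

    # 3x2 values against 3 column labels: the shapes really disagree, so the
    # shape message (not a re-raised inner error) is what the user sees.
    try:
        pd.DataFrame(np.zeros((3, 2)), columns=['a', 'b', 'c'])
    except ValueError as e:
        print(e)  # Shape of passed values is (3, 2), indices imply (3, 3)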