
Commit 14bc445

Merge pull request #5790 from jreback/apply_bugs
BUG: don't always coerce reductions in a groupby to datetimes
2 parents: b6ec4e2 + e375550
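
For context, a minimal sketch of the bug being fixed (the data and expected
values come from the new test added below; everything else here is
illustrative, not part of the diff):

    # Before this commit, a groupby/apply reduction was post-processed with
    # convert_objects(convert_dates='coerce', convert_numeric=True), so string
    # results could be mangled into datetimes/NaT.
    import pandas as pd

    df = pd.DataFrame({'date': ['2011.05.16', '2011.05.16',
                                '2011.05.17', '2011.05.17'],
                       'time': ['00:00', '01:00', '02:00', '03:00'],
                       'value': [1.40893, 1.40760, 1.40750, 1.40649]})

    # pick the 'time' string at each group's max 'value'; after this fix the
    # strings survive because no Timestamp appears in the applied results
    result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
    print(result)  # '00:00' for 2011.05.16, '02:00' for 2011.05.17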

File tree: 4 files changed, +75 -35 lines

pandas/core/common.py (+14 -9)

@@ -1527,17 +1527,22 @@ def _possibly_convert_objects(values, convert_dates=True,
                 values, convert_datetime=convert_dates)
 
     # convert to numeric
-    if convert_numeric and values.dtype == np.object_:
-        try:
-            new_values = lib.maybe_convert_numeric(
-                values, set(), coerce_numeric=True)
+    if values.dtype == np.object_:
+        if convert_numeric:
+            try:
+                new_values = lib.maybe_convert_numeric(
+                    values, set(), coerce_numeric=True)
 
-            # if we are all nans then leave me alone
-            if not isnull(new_values).all():
-                values = new_values
+                # if we are all nans then leave me alone
+                if not isnull(new_values).all():
+                    values = new_values
 
-        except:
-            pass
+            except:
+                pass
+        else:
+
+            # soft-conversion
+            values = lib.maybe_convert_objects(values)
 
     return values
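
A rough sketch of what the new soft-conversion branch changes (assuming
Series.convert_objects routes object blocks through _possibly_convert_objects,
as it did at the time):

    import pandas as pd

    s = pd.Series([1, 2, 3], dtype=object)

    # convert_numeric=True still hard-coerces via maybe_convert_numeric;
    # convert_numeric=False now soft-converts instead of doing nothing, so
    # clean object data is upgraded while mixed data like ['1', 'a'] is left
    # as object rather than coerced to NaN.
    print(s.convert_objects(convert_numeric=True).dtype)   # int64
    print(s.convert_objects(convert_numeric=False).dtype)  # int64 after this patch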

pandas/core/groupby.py (+19 -16)

@@ -22,6 +22,7 @@
                                 notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                 is_timedelta64_dtype, is_datetime64_dtype)
 
+from pandas import _np_version_under1p7
 import pandas.lib as lib
 from pandas.lib import Timestamp
 import pandas.algos as _algos

@@ -2243,16 +2244,19 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
             try:
                 if self.axis == 0:
 
-                    stacked_values = np.vstack([np.asarray(x)
-                                                for x in values])
-                    columns = v.index
-                    index = key_index
+                    # normally use vstack as its faster than concat
+                    # and if we have mi-columns
+                    if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
+                        stacked_values = np.vstack([np.asarray(x) for x in values])
+                        result = DataFrame(stacked_values,index=key_index,columns=v.index)
+                    else:
+                        # GH5788 instead of stacking; concat gets the dtypes correct
+                        from pandas.tools.merge import concat
+                        result = concat(values,keys=key_index,names=key_index.names,
+                                        axis=self.axis).unstack()
                 else:
-                    stacked_values = np.vstack([np.asarray(x)
-                                                for x in values]).T
-
-                    index = v.index
-                    columns = key_index
+                    stacked_values = np.vstack([np.asarray(x) for x in values])
+                    result = DataFrame(stacked_values.T,index=v.index,columns=key_index)
 
             except (ValueError, AttributeError):
                 # GH1738: values is list of arrays of unequal lengths fall

@@ -2261,15 +2265,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                 # if we have date/time like in the original, then coerce dates
                 # as we are stacking can easily have object dtypes here
-                cd = True
-                if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any():
-                    cd = 'coerce'
-                return DataFrame(stacked_values, index=index,
-                                 columns=columns).convert_objects(convert_dates=cd, convert_numeric=True)
+                cd = 'coerce' if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any() else True
+                return result.convert_objects(convert_dates=cd)
 
             else:
-                return Series(values, index=key_index).convert_objects(
-                    convert_dates='coerce',convert_numeric=True)
+                # only coerce dates if we find at least 1 datetime
+                cd = 'coerce' if any([ isinstance(v,Timestamp) for v in values ]) else False
+                return Series(values, index=key_index).convert_objects(convert_dates=cd)
+
         else:
            # Handle cases like BinGrouper
            return self._concat_objects(keys, values,
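
The Series branch above now decides coercion per result: dates are coerced
only when at least one applied value is an actual Timestamp. A standalone
sketch of that rule (a re-creation for illustration, not the pandas internals
themselves; the function name is made up, and pd.Timestamp stands in for the
diff's pandas.lib import):

    import pandas as pd

    def wrap_scalar_results(values, index):
        # mirror of the check above: coerce to datetimes only if a real
        # Timestamp shows up among the applied results
        cd = 'coerce' if any(isinstance(v, pd.Timestamp) for v in values) else False
        return pd.Series(values, index=index).convert_objects(convert_dates=cd)

    print(wrap_scalar_results(['00:00', '02:00'], ['a', 'b']).dtype)  # object
    print(wrap_scalar_results([pd.Timestamp('2011-05-16'),
                               pd.Timestamp('2011-05-17')], ['a', 'b']).dtype)
    # datetime64[ns]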

pandas/core/internals.py (+11 -9)

@@ -3556,12 +3556,14 @@ def _consolidate_inplace(self):
         pass
 
 
-def construction_error(tot_items, block_shape, axes):
+def construction_error(tot_items, block_shape, axes, e=None):
     """ raise a helpful message about our construction """
-    raise ValueError("Shape of passed values is %s, indices imply %s" % (
-        tuple(map(int, [tot_items] + list(block_shape))),
-        tuple(map(int, [len(ax) for ax in axes]))))
-
+    passed = tuple(map(int, [tot_items] + list(block_shape)))
+    implied = tuple(map(int, [len(ax) for ax in axes]))
+    if passed == implied and e is not None:
+        raise e
+    raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
+        passed,implied))
 
 def create_block_manager_from_blocks(blocks, axes):
     try:

@@ -3576,10 +3578,10 @@ def create_block_manager_from_blocks(blocks, axes):
         mgr._consolidate_inplace()
         return mgr
 
-    except (ValueError):
+    except (ValueError) as e:
         blocks = [getattr(b, 'values', b) for b in blocks]
         tot_items = sum(b.shape[0] for b in blocks)
-        construction_error(tot_items, blocks[0].shape[1:], axes)
+        construction_error(tot_items, blocks[0].shape[1:], axes, e)
 
 
 def create_block_manager_from_arrays(arrays, names, axes):

@@ -3588,8 +3590,8 @@ def create_block_manager_from_arrays(arrays, names, axes):
         mgr = BlockManager(blocks, axes)
         mgr._consolidate_inplace()
         return mgr
-    except (ValueError):
-        construction_error(len(arrays), arrays[0].shape[1:], axes)
+    except (ValueError) as e:
+        construction_error(len(arrays), arrays[0].shape[1:], axes, e)
 
 
 def maybe_create_block_in_items_map(im, block):
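
A quick demonstration of the construction_error change in isolation (a
standalone re-creation for illustration, not an import of the pandas
function): when the passed and implied shapes actually agree, the original
exception is re-raised instead of a misleading shape message.

    def construction_error(tot_items, block_shape, axes, e=None):
        passed = tuple(map(int, [tot_items] + list(block_shape)))
        implied = tuple(map(int, [len(ax) for ax in axes]))
        if passed == implied and e is not None:
            raise e  # shapes agree, so the ValueError was about something else
        raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
            passed, implied))

    try:
        construction_error(2, (3,), [range(3), range(2)])
    except ValueError as err:
        print(err)  # Shape of passed values is (2, 3), indices imply (3, 2)

    try:
        construction_error(2, (3,), [range(2), range(3)], ValueError("bad block"))
    except ValueError as err:
        print(err)  # bad block: the real error is no longer masked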

pandas/tests/test_groupby.py (+31 -1)

@@ -28,7 +28,7 @@
 import pandas.core.nanops as nanops
 
 import pandas.util.testing as tm
-
+import pandas as pd
 
 def commonSetUp(self):
     self.dateRange = bdate_range('1/1/2005', periods=250)

@@ -481,6 +481,36 @@ def test_apply_describe_bug(self):
         grouped = self.mframe.groupby(level='first')
         result = grouped.describe()  # it works!
 
+    def test_apply_issues(self):
+        # GH 5788
+
+        s="""2011.05.16,00:00,1.40893
+2011.05.16,01:00,1.40760
+2011.05.16,02:00,1.40750
+2011.05.16,03:00,1.40649
+2011.05.17,02:00,1.40893
+2011.05.17,03:00,1.40760
+2011.05.17,04:00,1.40750
+2011.05.17,05:00,1.40649
+2011.05.18,02:00,1.40893
+2011.05.18,03:00,1.40760
+2011.05.18,04:00,1.40750
+2011.05.18,05:00,1.40649"""
+
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], parse_dates=[['date', 'time']])
+        df = df.set_index('date_time')
+
+        expected = df.groupby(df.index.date).idxmax()
+        result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
+        assert_frame_equal(result,expected)
+
+        # GH 5789
+        # don't auto coerce dates
+        df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'])
+        expected = Series(['00:00','02:00','02:00'],index=['2011.05.16','2011.05.17','2011.05.18'])
+        result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()])
+        assert_series_equal(result,expected)
+
     def test_len(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,
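
To run just the new test from a source checkout (assuming nose, which the
suite used at the time, and that the test lands on the existing TestGroupBy
class in this file):

    nosetests pandas.tests.test_groupby:TestGroupBy.test_apply_issues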
