API: default empty DataFrame columns to dtype=object to prevent a certain class of TypeError, e.g. when a SQL query returns no rows. Closes #1783

wesm · wesm · commit 852a99459c2a · 2012-09-18T13:21:43.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -64,6 +64,10 @@ pandas 0.9.0
     transposed. Legacy files will still be readable by HDFStore (#1834, #1824)
   - Legacy cruft removed: pandas.stats.misc.quantileTS
   - Use ISO8601 format for Period repr: monthly, daily, and on down (#1776)
+  - Empty DataFrame columns are now created with object dtype. This prevents
+    a class of TypeErrors that occurred in code where the dtype of a column
+    depended on whether any data was present (e.g. whether a SQL query
+    returned results) (#1783)
 
 **Bug fixes**
 
@@ -184,6 +188,8 @@ pandas 0.9.0
     datetime.tzinfo without .zone and ._utcoffset attributes (#1922)
   - Fix DataFrame formatting of small, non-zero FP numbers (#1911)
   - Various fixes by upcasting of date -> datetime (#1395)
+  - Raise a clearer exception (SpecificationError) when multiple functions
+    with the same name, such as lambdas, are passed to GroupBy.aggregate
 
 pandas 0.8.1
 ============
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4994,7 +4994,12 @@ def _homogenize(data, index, columns, dtype=None):
             if dtype is not None and issubclass(dtype.type, np.integer):
                 continue
 
-            v = np.empty(len(index), dtype=dtype)
+            if dtype is None:
+                # #1783: no dtype given, so default empty columns to object
+                v = np.empty(len(index), dtype=object)
+            else:
+                v = np.empty(len(index), dtype=dtype)
+
             v.fill(nan)
         else:
             v = data[k]
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -18,6 +18,11 @@
 class GroupByError(Exception):
     pass
 
+class DataError(GroupByError):
+    pass
+
+class SpecificationError(GroupByError):
+    pass
 
 def _groupby_function(name, alias, npfunc):
     def f(self):
@@ -290,7 +295,7 @@ def mean(self):
         """
         try:
             return self._cython_agg_general('mean')
-        except GroupByError:
+        except DataError:
             raise
         except Exception:  # pragma: no cover
             f = lambda x: x.mean(axis=self.axis)
@@ -304,7 +309,7 @@ def median(self):
         """
         try:
             return self._cython_agg_general('median')
-        except GroupByError:
+        except DataError:
             raise
         except Exception:  # pragma: no cover
             f = lambda x: x.median(axis=self.axis)
@@ -375,7 +380,7 @@ def _cython_agg_general(self, how):
             output[name] = result
 
         if len(output) == 0:
-            raise GroupByError('No numeric types to aggregate')
+            raise DataError('No numeric types to aggregate')
 
         return self._wrap_aggregated_output(output, names)
 
@@ -1270,6 +1275,10 @@ def _aggregate_multiple_funcs(self, arg):
         results = {}
 
         for name, func in arg:
+            if name in results:
+                raise SpecificationError('Function names must be unique, '
+                                         'found multiple named %s' % name)
+
             results[name] = self.aggregate(func)
 
         return DataFrame(results, columns=columns)
@@ -1415,7 +1424,7 @@ def _cython_agg_blocks(self, how):
             new_blocks.append(newb)
 
         if len(new_blocks) == 0:
-            raise GroupByError('No numeric types to aggregate')
+            raise DataError('No numeric types to aggregate')
 
         return new_blocks
 
@@ -1542,7 +1551,7 @@ def _aggregate_multiple_funcs(self, arg):
                                      grouper=self.grouper)
                 results.append(colg.aggregate(arg))
                 keys.append(col)
-            except (TypeError, GroupByError):
+            except (TypeError, DataError):
                 pass
 
         result = concat(results, keys=keys, axis=1)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -951,7 +951,7 @@ def reindex_axis(self, new_axis, method=None, axis=0, copy=True):
                 result.axes[axis] = new_axis
 
                 if axis == 0:
-                    # patch ref_items
+                    # patch ref_items, #1823
                     for blk in result.blocks:
                         blk.ref_items = new_axis
 
@@ -1290,7 +1290,10 @@ def form_blocks(data, axes):
 
     if len(extra_items):
         shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])
-        block_values = np.empty(shape, dtype=float)
+
+        # empty items -> dtype object
+        block_values = np.empty(shape, dtype=object)
+
         block_values.fill(nan)
 
         na_block = make_block(block_values, extra_items, items,
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -1593,12 +1593,12 @@ def test_constructor_dict(self):
         tm.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False)
 
         frame = DataFrame({'col1' : self.ts1,
-                            'col2' : self.ts2},
+                           'col2' : self.ts2},
                            columns=['col2', 'col3', 'col4'])
 
         self.assertEqual(len(frame), len(self.ts2))
         self.assert_('col1' not in frame)
-        self.assert_(np.isnan(frame['col3']).all())
+        self.assert_(isnull(frame['col3']).all())
 
         # Corner cases
         self.assertEqual(len(DataFrame({})), 0)
@@ -1888,7 +1888,11 @@ def test_constructor_corner(self):
 
-        # does not error but ends up float
+        # does not error but ends up object
         df = DataFrame(index=range(10), columns=['a','b'], dtype=int)
-        self.assert_(df.values.dtype == np.float64)
+        self.assert_(df.values.dtype == np.object_)
+
+        # #1783 empty dtype object
+        df = DataFrame({}, columns=['foo', 'bar'])
+        self.assert_(df.values.dtype == np.object_)
 
     def test_constructor_scalar_inference(self):
         data = {'int' : 1, 'bool' : True,
@@ -3305,7 +3309,9 @@ def test_to_csv_multiindex(self):
         recons = DataFrame.from_csv(path)
         exp = tsframe[:0]
         exp.index = []
-        assert_frame_equal(recons, exp)
+
+        self.assert_(recons.columns.equals(exp.columns))
+        self.assert_(len(recons) == 0)
 
     def test_to_csv_float32_nanrep(self):
         df = DataFrame(np.random.randn(1, 4).astype(np.float32))
@@ -6632,7 +6638,7 @@ def test_boolean_indexing(self):
 
     def test_sum_bools(self):
         df = DataFrame(index=range(1), columns=range(10))
-        bools = np.isnan(df)
+        bools = isnull(df)
         self.assert_(bools.sum(axis=1)[0] == 10)
 
     def test_fillna_col_reordering(self):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -8,7 +8,7 @@
 from pandas.core.index import Index, MultiIndex
 from pandas.core.common import rands
 from pandas.core.api import Categorical, DataFrame
-from pandas.core.groupby import GroupByError
+from pandas.core.groupby import GroupByError, SpecificationError, DataError
 from pandas.core.series import Series
 from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
                                  assert_series_equal, assert_almost_equal)
@@ -252,11 +252,10 @@ def test_agg_apply_corner(self):
 
         # DataFrame
         grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
-        assert_frame_equal(grouped.sum(),
-                           DataFrame(columns=self.tsframe.columns))
-        assert_frame_equal(grouped.agg(np.sum),
-                           DataFrame(columns=self.tsframe.columns))
-        assert_frame_equal(grouped.apply(np.sum), DataFrame({}))
+        exp_df = DataFrame(columns=self.tsframe.columns, dtype=float)
+        assert_frame_equal(grouped.sum(), exp_df)
+        assert_frame_equal(grouped.agg(np.sum), exp_df)
+        assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float))
 
     def test_agg_grouping_is_list_tuple(self):
         from pandas.core.groupby import Grouping
@@ -1078,11 +1077,11 @@ def test_cython_agg_boolean(self):
     def test_cython_agg_nothing_to_agg(self):
         frame = DataFrame({'a': np.random.randint(0, 5, 50),
                            'b': ['foo', 'bar'] * 25})
-        self.assertRaises(GroupByError, frame.groupby('a')['b'].mean)
+        self.assertRaises(DataError, frame.groupby('a')['b'].mean)
 
         frame = DataFrame({'a': np.random.randint(0, 5, 50),
                            'b': ['foo', 'bar'] * 25})
-        self.assertRaises(GroupByError, frame[['b']].groupby(frame['a']).mean)
+        self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)
 
     def test_wrap_aggregated_output_multindex(self):
         df = self.mframe.T
@@ -1847,6 +1846,12 @@ def test_multiple_functions_tuples_and_non_tuples(self):
         expected = self.df.groupby('A').agg(ex_funcs)
         assert_frame_equal(result, expected)
 
+    def test_agg_multiple_functions_too_many_lambdas(self):
+        grouped = self.df.groupby('A')
+        funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
+
+        self.assertRaises(SpecificationError, grouped.agg, funcs)
+
     def test_more_flexible_frame_multi_function(self):
         from pandas import concat