Skip to content

Commit 852a994

Browse files
committed
API: default empty DataFrame to dtype=object to prevent a certain class of TypeError, e.g. one arising from an empty SQL query. Closes #1783
1 parent 5b033ce commit 852a994

File tree

6 files changed

+55
-21
lines changed

6 files changed

+55
-21
lines changed

RELEASE.rst

+6
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ pandas 0.9.0
6464
transposed. Legacy files will still be readable by HDFStore (#1834, #1824)
6565
- Legacy cruft removed: pandas.stats.misc.quantileTS
6666
- Use ISO8601 format for Period repr: monthly, daily, and on down (#1776)
67+
- Empty DataFrame columns are now created as object dtype. This will prevent
68+
a class of TypeErrors that was occurring in code where the dtype of a
69+
column would depend on whether any data was present (e.g. a SQL query having
70+
results) (#1783)
6771

6872
**Bug fixes**
6973

@@ -184,6 +188,8 @@ pandas 0.9.0
184188
datetime.tzinfo without .zone and ._utcoffset attributes (#1922)
185189
- Fix DataFrame formatting of small, non-zero FP numbers (#1911)
186190
- Various fixes by upcasting of date -> datetime (#1395)
191+
- Raise a more informative exception when passing multiple functions with the same name,
192+
such as lambdas, to GroupBy.aggregate
187193

188194
pandas 0.8.1
189195
============

pandas/core/frame.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -4994,7 +4994,12 @@ def _homogenize(data, index, columns, dtype=None):
49944994
if dtype is not None and issubclass(dtype.type, np.integer):
49954995
continue
49964996

4997-
v = np.empty(len(index), dtype=dtype)
4997+
if dtype is None:
4998+
# no dtype requested: default the empty column to object dtype (#1783)
4999+
v = np.empty(len(index), dtype=object)
5000+
else:
5001+
v = np.empty(len(index), dtype=dtype)
5002+
49985003
v.fill(nan)
49995004
else:
50005005
v = data[k]

pandas/core/groupby.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@
1818
class GroupByError(Exception):
1919
pass
2020

21+
class DataError(GroupByError):
22+
pass
23+
24+
class SpecificationError(GroupByError):
25+
pass
2126

2227
def _groupby_function(name, alias, npfunc):
2328
def f(self):
@@ -290,7 +295,7 @@ def mean(self):
290295
"""
291296
try:
292297
return self._cython_agg_general('mean')
293-
except GroupByError:
298+
except DataError:
294299
raise
295300
except Exception: # pragma: no cover
296301
f = lambda x: x.mean(axis=self.axis)
@@ -304,7 +309,7 @@ def median(self):
304309
"""
305310
try:
306311
return self._cython_agg_general('median')
307-
except GroupByError:
312+
except DataError:
308313
raise
309314
except Exception: # pragma: no cover
310315
f = lambda x: x.median(axis=self.axis)
@@ -375,7 +380,7 @@ def _cython_agg_general(self, how):
375380
output[name] = result
376381

377382
if len(output) == 0:
378-
raise GroupByError('No numeric types to aggregate')
383+
raise DataError('No numeric types to aggregate')
379384

380385
return self._wrap_aggregated_output(output, names)
381386

@@ -1270,6 +1275,10 @@ def _aggregate_multiple_funcs(self, arg):
12701275
results = {}
12711276

12721277
for name, func in arg:
1278+
if name in results:
1279+
raise SpecificationError('Function names must be unique, '
1280+
'found multiple named %s' % name)
1281+
12731282
results[name] = self.aggregate(func)
12741283

12751284
return DataFrame(results, columns=columns)
@@ -1415,7 +1424,7 @@ def _cython_agg_blocks(self, how):
14151424
new_blocks.append(newb)
14161425

14171426
if len(new_blocks) == 0:
1418-
raise GroupByError('No numeric types to aggregate')
1427+
raise DataError('No numeric types to aggregate')
14191428

14201429
return new_blocks
14211430

@@ -1542,7 +1551,7 @@ def _aggregate_multiple_funcs(self, arg):
15421551
grouper=self.grouper)
15431552
results.append(colg.aggregate(arg))
15441553
keys.append(col)
1545-
except (TypeError, GroupByError):
1554+
except (TypeError, DataError):
15461555
pass
15471556

15481557
result = concat(results, keys=keys, axis=1)

pandas/core/internals.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -951,7 +951,7 @@ def reindex_axis(self, new_axis, method=None, axis=0, copy=True):
951951
result.axes[axis] = new_axis
952952

953953
if axis == 0:
954-
# patch ref_items
954+
# patch ref_items, #1823
955955
for blk in result.blocks:
956956
blk.ref_items = new_axis
957957

@@ -1290,7 +1290,10 @@ def form_blocks(data, axes):
12901290

12911291
if len(extra_items):
12921292
shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])
1293-
block_values = np.empty(shape, dtype=float)
1293+
1294+
# empty items -> dtype object
1295+
block_values = np.empty(shape, dtype=object)
1296+
12941297
block_values.fill(nan)
12951298

12961299
na_block = make_block(block_values, extra_items, items,

pandas/tests/test_frame.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -1593,12 +1593,12 @@ def test_constructor_dict(self):
15931593
tm.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False)
15941594

15951595
frame = DataFrame({'col1' : self.ts1,
1596-
'col2' : self.ts2},
1596+
'col2' : self.ts2},
15971597
columns=['col2', 'col3', 'col4'])
15981598

15991599
self.assertEqual(len(frame), len(self.ts2))
16001600
self.assert_('col1' not in frame)
1601-
self.assert_(np.isnan(frame['col3']).all())
1601+
self.assert_(isnull(frame['col3']).all())
16021602

16031603
# Corner cases
16041604
self.assertEqual(len(DataFrame({})), 0)
@@ -1888,7 +1888,11 @@ def test_constructor_corner(self):
18881888

18891889
# does not error but ends up object dtype (empty columns default to object, #1783)
18901890
df = DataFrame(index=range(10), columns=['a','b'], dtype=int)
1891-
self.assert_(df.values.dtype == np.float64)
1891+
self.assert_(df.values.dtype == np.object_)
1892+
1893+
# #1783: columns of an empty DataFrame default to object dtype
1894+
df = DataFrame({}, columns=['foo', 'bar'])
1895+
self.assert_(df.values.dtype == np.object_)
18921896

18931897
def test_constructor_scalar_inference(self):
18941898
data = {'int' : 1, 'bool' : True,
@@ -3305,7 +3309,9 @@ def test_to_csv_multiindex(self):
33053309
recons = DataFrame.from_csv(path)
33063310
exp = tsframe[:0]
33073311
exp.index = []
3308-
assert_frame_equal(recons, exp)
3312+
3313+
self.assert_(recons.columns.equals(exp.columns))
3314+
self.assert_(len(recons) == 0)
33093315

33103316
def test_to_csv_float32_nanrep(self):
33113317
df = DataFrame(np.random.randn(1, 4).astype(np.float32))
@@ -6632,7 +6638,7 @@ def test_boolean_indexing(self):
66326638

66336639
def test_sum_bools(self):
66346640
df = DataFrame(index=range(1), columns=range(10))
6635-
bools = np.isnan(df)
6641+
bools = isnull(df)
66366642
self.assert_(bools.sum(axis=1)[0] == 10)
66376643

66386644
def test_fillna_col_reordering(self):

pandas/tests/test_groupby.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pandas.core.index import Index, MultiIndex
99
from pandas.core.common import rands
1010
from pandas.core.api import Categorical, DataFrame
11-
from pandas.core.groupby import GroupByError
11+
from pandas.core.groupby import GroupByError, SpecificationError, DataError
1212
from pandas.core.series import Series
1313
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
1414
assert_series_equal, assert_almost_equal)
@@ -252,11 +252,10 @@ def test_agg_apply_corner(self):
252252

253253
# DataFrame
254254
grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
255-
assert_frame_equal(grouped.sum(),
256-
DataFrame(columns=self.tsframe.columns))
257-
assert_frame_equal(grouped.agg(np.sum),
258-
DataFrame(columns=self.tsframe.columns))
259-
assert_frame_equal(grouped.apply(np.sum), DataFrame({}))
255+
exp_df = DataFrame(columns=self.tsframe.columns, dtype=float)
256+
assert_frame_equal(grouped.sum(), exp_df)
257+
assert_frame_equal(grouped.agg(np.sum), exp_df)
258+
assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float))
260259

261260
def test_agg_grouping_is_list_tuple(self):
262261
from pandas.core.groupby import Grouping
@@ -1078,11 +1077,11 @@ def test_cython_agg_boolean(self):
10781077
def test_cython_agg_nothing_to_agg(self):
10791078
frame = DataFrame({'a': np.random.randint(0, 5, 50),
10801079
'b': ['foo', 'bar'] * 25})
1081-
self.assertRaises(GroupByError, frame.groupby('a')['b'].mean)
1080+
self.assertRaises(DataError, frame.groupby('a')['b'].mean)
10821081

10831082
frame = DataFrame({'a': np.random.randint(0, 5, 50),
10841083
'b': ['foo', 'bar'] * 25})
1085-
self.assertRaises(GroupByError, frame[['b']].groupby(frame['a']).mean)
1084+
self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean)
10861085

10871086
def test_wrap_aggregated_output_multindex(self):
10881087
df = self.mframe.T
@@ -1847,6 +1846,12 @@ def test_multiple_functions_tuples_and_non_tuples(self):
18471846
expected = self.df.groupby('A').agg(ex_funcs)
18481847
assert_frame_equal(result, expected)
18491848

1849+
def test_agg_multiple_functions_too_many_lambdas(self):
1850+
grouped = self.df.groupby('A')
1851+
funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
1852+
1853+
self.assertRaises(SpecificationError, grouped.agg, funcs)
1854+
18501855
def test_more_flexible_frame_multi_function(self):
18511856
from pandas import concat
18521857

0 commit comments

Comments
 (0)