diff --git a/doc/source/release.rst b/doc/source/release.rst
index 9fa111d32e4bb..8f5308a90a0c7 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -562,6 +562,7 @@ Bug Fixes
     (:issue:`5102`).
   - Fixed a bug where ``groupby.plot()`` and friends were duplicating figures
     multiple times (:issue:`5102`).
+  - Provide automatic conversion of ``object`` dtypes on fillna, related (:issue:`5103`)
 
 
 pandas 0.12.0
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 3b451e2a3b196..9abcdd8ea4780 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -1177,7 +1177,7 @@ def convert(self, convert_dates=True, convert_numeric=True, copy=True, by_item=T
         # attempt to create new type blocks
         is_unique = self.items.is_unique
         blocks = []
-        if by_item:
+        if by_item and not self._is_single_block:
 
             for i, c in enumerate(self.items):
                 values = self.iget(i)
@@ -1200,6 +1200,17 @@ def convert(self, convert_dates=True, convert_numeric=True, copy=True, by_item=T
 
         return blocks
 
+    def _maybe_downcast(self, blocks, downcast=None):
+
+        if downcast is not None:
+            return blocks
+
+        # split and convert the blocks
+        result_blocks = []
+        for blk in blocks:
+            result_blocks.extend(blk.convert(convert_dates=True,convert_numeric=False))
+        return result_blocks
+
     def _can_hold_element(self, element):
         return True
 
@@ -2050,6 +2061,8 @@ def apply(self, f, *args, **kwargs):
                 result_blocks.extend(applied)
             else:
                 result_blocks.append(applied)
+        if len(result_blocks) == 0:
+            return self.make_empty(axes or self.axes)
         bm = self.__class__(
             result_blocks, axes or self.axes, do_integrity_check=do_integrity_check)
         bm._consolidate_inplace()
diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index c1c6e6e2f83d3..a10f3582bfe45 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -411,11 +411,13 @@ def na_op(x, y):
             result = expressions.evaluate(op, str_rep, x, y,
                                           raise_on_error=True, **eval_kwargs)
         except TypeError:
-            result = pa.empty(len(x), dtype=x.dtype)
             if isinstance(y, (pa.Array, pd.Series)):
+                dtype = np.find_common_type([x.dtype,y.dtype],[])
+                result = np.empty(x.size, dtype=dtype)
                 mask = notnull(x) & notnull(y)
                 result[mask] = op(x[mask], y[mask])
             else:
+                result = pa.empty(len(x), dtype=x.dtype)
                 mask = notnull(x)
                 result[mask] = op(x[mask], y)
 
@@ -690,12 +692,14 @@ def na_op(x, y):
                 op, str_rep, x, y, raise_on_error=True, **eval_kwargs)
         except TypeError:
             xrav = x.ravel()
-            result = np.empty(x.size, dtype=x.dtype)
             if isinstance(y, (np.ndarray, pd.Series)):
+                dtype = np.find_common_type([x.dtype,y.dtype],[])
+                result = np.empty(x.size, dtype=dtype)
                 yrav = y.ravel()
                 mask = notnull(xrav) & notnull(yrav)
                 result[mask] = op(xrav[mask], yrav[mask])
             else:
+                result = np.empty(x.size, dtype=x.dtype)
                 mask = notnull(xrav)
                 result[mask] = op(xrav[mask], y)
 
@@ -855,6 +859,8 @@ def na_op(x, y):
             result = expressions.evaluate(op, str_rep, x, y,
                                           raise_on_error=True, **eval_kwargs)
         except TypeError:
+
+            # TODO: might need to find_common_type here?
             result = pa.empty(len(x), dtype=x.dtype)
             mask = notnull(x)
             result[mask] = op(x[mask], y)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index ff0e1b08d7247..0411934b9ef87 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -1664,8 +1664,9 @@ def get_atom_string(self, block, itemsize):
 
     def set_atom_string(
             self, block, existing_col, min_itemsize, nan_rep, encoding):
-        # fill nan items with myself
-        block = block.fillna(nan_rep)[0]
+        # fill nan items with myself, don't disturb the blocks by
+        # trying to downcast
+        block = block.fillna(nan_rep, downcast=False)[0]
         data = block.values
 
         # see if we have a valid string type
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 1e4e988431f43..9cb7483340817 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -4311,15 +4311,16 @@ def test_operators_none_as_na(self):
 
         ops = [operator.add, operator.sub, operator.mul, operator.truediv]
 
+        # since filling converts dtypes from object, changed expected to be object
         for op in ops:
             filled = df.fillna(np.nan)
             result = op(df, 3)
-            expected = op(filled, 3)
+            expected = op(filled, 3).astype(object)
             expected[com.isnull(expected)] = None
             assert_frame_equal(result, expected)
 
             result = op(df, df)
-            expected = op(filled, filled)
+            expected = op(filled, filled).astype(object)
             expected[com.isnull(expected)] = None
             assert_frame_equal(result, expected)
 
@@ -4327,7 +4328,7 @@ def test_operators_none_as_na(self):
             assert_frame_equal(result, expected)
 
             result = op(df.fillna(7), df)
-            assert_frame_equal(result, expected)
+            assert_frame_equal(result, expected, check_dtype=False)
 
     def test_comparison_invalid(self):
 
@@ -6695,6 +6696,25 @@ def test_fillna(self):
         df.fillna({ 2: 'foo' }, inplace=True)
         assert_frame_equal(df, expected)
 
+    def test_fillna_dtype_conversion(self):
+        # make sure that fillna on an empty frame works
+        df = DataFrame(index=["A","B","C"], columns = [1,2,3,4,5])
+        result = df.get_dtype_counts().order()
+        expected = Series({ 'object' : 5 })
+        assert_series_equal(result, expected)
+
+        result = df.fillna(1)
+        expected = DataFrame(1, index=["A","B","C"], columns = [1,2,3,4,5])
+        result = result.get_dtype_counts().order()
+        expected = Series({ 'int64' : 5 })
+        assert_series_equal(result, expected)
+
+        # empty block
+        df = DataFrame(index=lrange(3),columns=['A','B'],dtype='float64')
+        result = df.fillna('nan')
+        expected = DataFrame('nan',index=lrange(3),columns=['A','B'])
+        assert_frame_equal(result, expected)
+
     def test_ffill(self):
         self.tsframe['A'][:5] = nan
         self.tsframe['A'][-5:] = nan
@@ -10812,7 +10832,6 @@ def test_boolean_indexing_mixed(self):
         expected.loc[35,4] = 1
         assert_frame_equal(df2,expected)
 
-        # add object, should this raise?
         df['foo'] = 'test'
         with tm.assertRaisesRegexp(TypeError, 'boolean setting on mixed-type'):
             df[df > 0.3] = 1
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 5c94f378b88ea..07b33266d88a1 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -732,8 +732,9 @@ def test_logical_with_nas(self):
         expected = DataFrame({'a': [np.nan, True]})
         assert_frame_equal(result, expected)
 
+        # this is autodowncasted here
         result = d['ItemA'].fillna(False) | d['ItemB']
-        expected = DataFrame({'a': [True, True]}, dtype=object)
+        expected = DataFrame({'a': [True, True]})
         assert_frame_equal(result, expected)
 
     def test_neg(self):