diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f6d5e3df814fc..cc13f39a47d5a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -886,3 +886,5 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) + +- Concatenating multiple objects will no longer automatically upcast the result to ``float64``; instead, the smallest ``dtype`` that suffices is used (:issue:`13247`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index aa954fbee9a60..1c070b3ed34a9 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -21,6 +21,7 @@ is_datetime64tz_dtype, is_object_dtype, is_datetimelike_v_numeric, + is_float_dtype, is_numeric_dtype, is_numeric_v_string_like, is_extension_type, is_list_like, is_re, @@ -4522,6 +4523,8 @@ def _interleaved_dtype(blocks): return np.dtype('int%s' % (lcd.itemsize * 8 * 2)) return lcd + elif have_int and have_float and not have_complex: + return np.dtype('float64') elif have_complex: return np.dtype('c16') else: @@ -4891,6 +4894,8 @@ def get_empty_dtype_and_na(join_units): upcast_cls = 'datetime' elif is_timedelta64_dtype(dtype): upcast_cls = 'timedelta' + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + upcast_cls = dtype.name else: upcast_cls = 'float' @@ -4915,8 +4920,6 @@ def get_empty_dtype_and_na(join_units): return np.dtype(np.bool_), None elif 'category' in upcast_classes: return np.dtype(np.object_), np.nan - elif 'float' in upcast_classes: - return np.dtype(np.float64), np.nan elif 'datetimetz' in upcast_classes: dtype = upcast_classes['datetimetz'] return dtype[0], tslib.iNaT @@ -4925,7 +4928,17 @@ def 
get_empty_dtype_and_na(join_units): elif 'timedelta' in upcast_classes: return np.dtype('m8[ns]'), tslib.iNaT else: # pragma - raise AssertionError("invalid dtype determination in get_concat_dtype") + g = np.find_common_type(upcast_classes, []) + if is_float_dtype(g): + return g, g.type(np.nan) + elif is_numeric_dtype(g): + if has_none_blocks: + return np.float64, np.nan + else: + return g, None + else: + msg = "invalid dtype determination in get_concat_dtype" + raise AssertionError(msg) def concatenate_join_units(join_units, concat_axis, copy): @@ -5190,7 +5203,6 @@ def is_null(self): return True def get_reindexed_values(self, empty_dtype, upcasted_na): - if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index a00f880ff6591..b92ffbfb6fe59 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -210,7 +210,7 @@ def f(): df.loc[3] = [6, 7] exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'], - dtype='float64') + dtype='object') tm.assert_frame_equal(df, exp) def test_series_partial_set(self): diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 5ab2bbc4ac6ba..df5e843097514 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -651,7 +651,7 @@ def test_interleave(self): mgr = create_mgr('a: f8; b: i8') self.assertEqual(mgr.as_matrix().dtype, 'f8') mgr = create_mgr('a: f4; b: i8') - self.assertEqual(mgr.as_matrix().dtype, 'f4') + self.assertEqual(mgr.as_matrix().dtype, 'f8') mgr = create_mgr('a: f4; b: i8; d: object') self.assertEqual(mgr.as_matrix().dtype, 'object') mgr = create_mgr('a: bool; b: i8') diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index d587e4ea6a1fa..24e26be15a44b 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -250,6 +250,7 @@ def test_basic_types(self): 
self.assertEqual(type(r), exp_df_type) r = get_dummies(s_df, sparse=self.sparse, columns=['a']) + exp_blk_type = pd.core.internals.IntBlock self.assertEqual(type(r[['a_0']]._data.blocks[0]), exp_blk_type) self.assertEqual(type(r[['a_1']]._data.blocks[0]), exp_blk_type) self.assertEqual(type(r[['a_2']]._data.blocks[0]), exp_blk_type) diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index a2b5773f551c9..a0b22892e74c5 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -13,6 +13,8 @@ makeCustomDataframe as mkdf, assert_almost_equal) +import pytest + class ConcatenateBase(tm.TestCase): @@ -1899,3 +1901,15 @@ def test_concat_multiindex_dfs_with_deepcopy(self): tm.assert_frame_equal(result_copy, expected) result_no_copy = pd.concat(example_dict, names=['testname']) tm.assert_frame_equal(result_no_copy, expected) + + +@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) +@pytest.mark.parametrize('dt', np.sctypes['float']) +def test_concat_no_unnecessary_upcast(dt, pdt): + # GH 13247 + dims = pdt().ndim + dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], dtype=dt, ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims))] + x = pd.concat(dfs) + assert x.values.dtype == dt