diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 42db0388ca5d9..b11b27716ce8b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -43,7 +43,7 @@ Backwards incompatible API changes .. _whatsnew_0190.api: - +- Concatenating multiple objects no longer automatically upcasts the result to ``float64``; instead, the smallest ``dtype`` that suffices is used (:issue:`13247`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 97df81ad6be48..be1ca0af802d1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -19,6 +19,7 @@ array_equivalent, _is_na_compat, _maybe_convert_string_to_object, _maybe_convert_scalar, + is_float_dtype, is_numeric_dtype, is_categorical, is_datetimelike_v_numeric, is_numeric_v_string_like, is_extension_type) import pandas.core.algorithms as algos @@ -4443,6 +4444,8 @@ def _lcd_dtype(l): return np.dtype('int%s' % (lcd.itemsize * 8 * 2)) return lcd + elif have_int and have_float and not have_complex: + return np.dtype('float64') elif have_complex: return np.dtype('c16') else: @@ -4785,6 +4788,8 @@ def get_empty_dtype_and_na(join_units): upcast_cls = 'datetime' elif is_timedelta64_dtype(dtype): upcast_cls = 'timedelta' + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + upcast_cls = dtype.name else: upcast_cls = 'float' @@ -4809,8 +4814,6 @@ def get_empty_dtype_and_na(join_units): return np.dtype(np.bool_), None elif 'category' in upcast_classes: return np.dtype(np.object_), np.nan - elif 'float' in upcast_classes: - return np.dtype(np.float64), np.nan elif 'datetimetz' in upcast_classes: dtype = upcast_classes['datetimetz'] return dtype[0], tslib.iNaT @@ -4819,7 +4822,17 @@ def get_empty_dtype_and_na(join_units): elif 'timedelta' in upcast_classes: return np.dtype('m8[ns]'), tslib.iNaT else: # pragma - raise AssertionError("invalid dtype determination in get_concat_dtype") + g = np.find_common_type(upcast_classes, []) + if
is_float_dtype(g): + return g, g.type(np.nan) + elif is_numeric_dtype(g): + if has_none_blocks: + return np.float64, np.nan + else: + return g, None + else: + msg = "invalid dtype determination in get_concat_dtype" + raise AssertionError(msg) def concatenate_join_units(join_units, concat_axis, copy): @@ -5083,7 +5096,6 @@ def is_null(self): return True def get_reindexed_values(self, empty_dtype, upcasted_na): - if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index b86b248ead290..36d4f18dd6a24 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -4035,11 +4035,11 @@ def f(): self.assertRaises(ValueError, f) - # these are coerced to float unavoidably (as its a list-like to begin) + # these are coerced to object unavoidably (as its a list-like to begin) df = DataFrame(columns=['A', 'B']) df.loc[3] = [6, 7] assert_frame_equal(df, DataFrame( - [[6, 7]], index=[3], columns=['A', 'B'], dtype='float64')) + [[6, 7]], index=[3], columns=['A', 'B'], dtype='object')) def test_partial_setting_with_datetimelike_dtype(self): diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 6a97f195abba7..44e0a42d1360a 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -655,7 +655,7 @@ def test_interleave(self): mgr = create_mgr('a: f8; b: i8') self.assertEqual(mgr.as_matrix().dtype, 'f8') mgr = create_mgr('a: f4; b: i8') - self.assertEqual(mgr.as_matrix().dtype, 'f4') + self.assertEqual(mgr.as_matrix().dtype, 'f8') mgr = create_mgr('a: f4; b: i8; d: object') self.assertEqual(mgr.as_matrix().dtype, 'object') mgr = create_mgr('a: bool; b: i8') diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 9d9b0635e0f35..f43d98fda6398 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -1031,6 
+1031,40 @@ def test_concat_invalid_first_argument(self): expected = read_csv(StringIO(data)) assert_frame_equal(result, expected) + def test_concat_no_unnecessary_upcasts(self): + # fixes #13247 + + for pdt in [pd.Series, pd.DataFrame, pd.Panel, pd.Panel4D]: + dims = pdt().ndim + for dt in np.sctypes['float']: + dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], dtype=dt, ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims))] + x = pd.concat(dfs) + self.assertTrue(x.values.dtype == dt) + + for dt in (np.sctypes['int'] + np.sctypes['uint']): + dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims))] + x = pd.concat(dfs) + self.assertTrue(x.values.dtype == dt) + + objs = [] + objs.append(pdt(np.array([1], dtype=np.float32, ndmin=dims))) + objs.append(pdt(np.array([1], dtype=np.float16, ndmin=dims))) + self.assertTrue(pd.concat(objs).values.dtype == np.float32) + + objs = [] + objs.append(pdt(np.array([1], dtype=np.int32, ndmin=dims))) + objs.append(pdt(np.array([1], dtype=np.int64, ndmin=dims))) + self.assertTrue(pd.concat(objs).values.dtype == np.int64) + + # not sure what is the best answer here + objs = [] + objs.append(pdt(np.array([1], dtype=np.int32, ndmin=dims))) + objs.append(pdt(np.array([1], dtype=np.float16, ndmin=dims))) + self.assertTrue(pd.concat(objs).values.dtype == np.float64) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],