BUG: upcasting on reshaping ops pandas-dev#13247

jaehoonhwang · jaehoonhwang · commit 468baeebe8b0 · 2017-03-05T22:40:15.000-08:00
Only rebasing and fixing the merge conflicts. Original work done by: jennolsen84. Original branch: https://github.com/jennolsen84/pandas/tree/concatnan
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -771,3 +771,5 @@ Bug Fixes
 - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
 - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
 - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
+
+- Concatenating multiple objects will no longer automatically upcast to ``float64``; instead, the smallest ``dtype`` that suffices is used (:issue:`13247`)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -31,6 +31,7 @@
                                _maybe_convert_string_to_object,
                                _maybe_upcast,
                                _maybe_convert_scalar, _maybe_promote,
+                               is_float_dtype, is_numeric_dtype,
                                _infer_dtype_from_scalar,
                                _soft_convert_objects,
                                _possibly_convert_objects,
@@ -4523,6 +4524,8 @@ def _interleaved_dtype(blocks):
             return np.dtype('int%s' % (lcd.itemsize * 8 * 2))
         return lcd
 
+    elif have_int and have_float and not have_complex:
+        return np.dtype('float64')
     elif have_complex:
         return np.dtype('c16')
     else:
@@ -4892,6 +4895,8 @@ def get_empty_dtype_and_na(join_units):
             upcast_cls = 'datetime'
         elif is_timedelta64_dtype(dtype):
             upcast_cls = 'timedelta'
+        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
+            upcast_cls = dtype.name
         else:
             upcast_cls = 'float'
 
@@ -4916,8 +4921,6 @@ def get_empty_dtype_and_na(join_units):
             return np.dtype(np.bool_), None
     elif 'category' in upcast_classes:
         return np.dtype(np.object_), np.nan
-    elif 'float' in upcast_classes:
-        return np.dtype(np.float64), np.nan
     elif 'datetimetz' in upcast_classes:
         dtype = upcast_classes['datetimetz']
         return dtype[0], tslib.iNaT
@@ -4926,7 +4929,17 @@ def get_empty_dtype_and_na(join_units):
     elif 'timedelta' in upcast_classes:
         return np.dtype('m8[ns]'), tslib.iNaT
     else:  # pragma
-        raise AssertionError("invalid dtype determination in get_concat_dtype")
+        g = pandas.types._.find_common_type(upcast_classes, [])
+        if is_float_type(g):
+            return g, g.type(np.nan)
+        elif is_numeric_dtype(g):
+            if has_none_blocks:
+                return np.float64, np.nan
+            else:
+                return g, None
+        else:
+            msg = "invalid dtype determination in get_concat_dtype"
+            raise AssertionError(msg)
 
 
 def concatenate_join_units(join_units, concat_axis, copy):
@@ -5191,7 +5204,6 @@ def is_null(self):
         return True
 
     def get_reindexed_values(self, empty_dtype, upcasted_na):
-
         if upcasted_na is None:
             # No upcasting is necessary
             fill_value = self.block.fill_value
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
@@ -651,7 +651,7 @@ def test_interleave(self):
         mgr = create_mgr('a: f8; b: i8')
         self.assertEqual(mgr.as_matrix().dtype, 'f8')
         mgr = create_mgr('a: f4; b: i8')
-        self.assertEqual(mgr.as_matrix().dtype, 'f4')
+        self.assertEqual(mgr.as_matrix().dtype, 'f8')
         mgr = create_mgr('a: f4; b: i8; d: object')
         self.assertEqual(mgr.as_matrix().dtype, 'object')
         mgr = create_mgr('a: bool; b: i8')
diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py
@@ -1899,3 +1899,38 @@ def test_concat_multiindex_dfs_with_deepcopy(self):
         tm.assert_frame_equal(result_copy, expected)
         result_no_copy = pd.concat(example_dict, names=['testname'])
         tm.assert_frame_equal(result_no_copy, expected)
+
+
+    def test_concat_no_unnecessary_upcats(self):
+        # GH 13247: concat must not upcast when the inputs' dtypes suffice
+
+        for pdt in [pd.Series, pd.DataFrame, pd.Panel, pd.Panel4D]:
+            dims = pdt().ndim  # ndmin below matches the container's ndim
+            for dt in np.sctypes['float']:  # floats (incl. NaN) keep dtype
+                dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
+                       pdt(np.array([np.nan], dtype=dt, ndmin=dims)),
+                       pdt(np.array([5], dtype=dt, ndmin=dims))]
+                x = pd.concat(dfs)
+                self.assertTrue(x.values.dtype == dt)
+
+            for dt in(np.sctypes['int'] + np.sctypes['uint']):  # ints too
+                dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
+                       pdt(np.array([5], dtype=dt ,ndmin=dims))]
+                x = pd.concat(dfs)
+                self.assertTrue(x.values.dtype == dt)
+
+            objs = []  # float32 + float16 -> float32
+            objs.append(pdt(np.array([1], dtype=np.float32, ndmin=dims)))
+            objs.append(pdt(np.array([1], dtype=np.float16, ndmin=dims)))
+            self.assertTrue(pd.concat(objs).values.dtype == np.float32)
+
+            objs = []  # int32 + int64 -> int64
+            objs.append(pdt(np.array([1], dtype=np.int32, ndmin=dims)))
+            objs.append(pdt(np.array([1], dtype=np.int64, ndmin=dims)))
+            self.assertTrue(pd.concat(objs).values.dtype == np.int64)
+
+            # int32 + float16 -> float64; not sure that is the best answer
+            objs = []
+            objs.append(pdt(np.array([1], dtype=np.int32, ndmin=dims)))
+            objs.append(pdt(np.array([1], dtype=np.float16, ndmin=dims)))
+            self.assertTrue(pd.concat(objs).values.dtype == np.float64)