Skip to content

Commit 468baee

Browse files
committed
BUG: upcasting on reshaping ops pandas-dev#13247
Only rebasing and fixing the merge conflicts. Original work done by: jennolsen84. Original branch: https://github.com/jennolsen84/pandas/tree/concatnan
1 parent 09360d8 commit 468baee

File tree

4 files changed

+54
-5
lines changed

4 files changed

+54
-5
lines changed

doc/source/whatsnew/v0.20.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -771,3 +771,5 @@ Bug Fixes
771771
- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
772772
- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
773773
- Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
774+
775+
- Concatenating multiple objects will no longer automatically upcast the result to ``float64``; instead, the smallest ``dtype`` that suffices is used (:issue:`13247`)

pandas/core/internals.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
_maybe_convert_string_to_object,
3232
_maybe_upcast,
3333
_maybe_convert_scalar, _maybe_promote,
34+
is_float_dtype, is_numeric_dtype,
3435
_infer_dtype_from_scalar,
3536
_soft_convert_objects,
3637
_possibly_convert_objects,
@@ -4523,6 +4524,8 @@ def _interleaved_dtype(blocks):
45234524
return np.dtype('int%s' % (lcd.itemsize * 8 * 2))
45244525
return lcd
45254526

4527+
elif have_int and have_float and not have_complex:
4528+
return np.dtype('float64')
45264529
elif have_complex:
45274530
return np.dtype('c16')
45284531
else:
@@ -4892,6 +4895,8 @@ def get_empty_dtype_and_na(join_units):
48924895
upcast_cls = 'datetime'
48934896
elif is_timedelta64_dtype(dtype):
48944897
upcast_cls = 'timedelta'
4898+
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
4899+
upcast_cls = dtype.name
48954900
else:
48964901
upcast_cls = 'float'
48974902

@@ -4916,8 +4921,6 @@ def get_empty_dtype_and_na(join_units):
49164921
return np.dtype(np.bool_), None
49174922
elif 'category' in upcast_classes:
49184923
return np.dtype(np.object_), np.nan
4919-
elif 'float' in upcast_classes:
4920-
return np.dtype(np.float64), np.nan
49214924
elif 'datetimetz' in upcast_classes:
49224925
dtype = upcast_classes['datetimetz']
49234926
return dtype[0], tslib.iNaT
@@ -4926,7 +4929,17 @@ def get_empty_dtype_and_na(join_units):
49264929
elif 'timedelta' in upcast_classes:
49274930
return np.dtype('m8[ns]'), tslib.iNaT
49284931
else: # pragma
4929-
raise AssertionError("invalid dtype determination in get_concat_dtype")
4932+
g = pandas.types._.find_common_type(upcast_classes, [])
4933+
if is_float_type(g):
4934+
return g, g.type(np.nan)
4935+
elif is_numeric_dtype(g):
4936+
if has_none_blocks:
4937+
return np.float64, np.nan
4938+
else:
4939+
return g, None
4940+
else:
4941+
msg = "invalid dtype determination in get_concat_dtype"
4942+
raise AssertionError(msg)
49304943

49314944

49324945
def concatenate_join_units(join_units, concat_axis, copy):
@@ -5191,7 +5204,6 @@ def is_null(self):
51915204
return True
51925205

51935206
def get_reindexed_values(self, empty_dtype, upcasted_na):
5194-
51955207
if upcasted_na is None:
51965208
# No upcasting is necessary
51975209
fill_value = self.block.fill_value

pandas/tests/test_internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,7 @@ def test_interleave(self):
651651
mgr = create_mgr('a: f8; b: i8')
652652
self.assertEqual(mgr.as_matrix().dtype, 'f8')
653653
mgr = create_mgr('a: f4; b: i8')
654-
self.assertEqual(mgr.as_matrix().dtype, 'f4')
654+
self.assertEqual(mgr.as_matrix().dtype, 'f8')
655655
mgr = create_mgr('a: f4; b: i8; d: object')
656656
self.assertEqual(mgr.as_matrix().dtype, 'object')
657657
mgr = create_mgr('a: bool; b: i8')

pandas/tests/tools/test_concat.py

+35
Original file line numberDiff line numberDiff line change
@@ -1899,3 +1899,38 @@ def test_concat_multiindex_dfs_with_deepcopy(self):
18991899
tm.assert_frame_equal(result_copy, expected)
19001900
result_no_copy = pd.concat(example_dict, names=['testname'])
19011901
tm.assert_frame_equal(result_no_copy, expected)
1902+
1903+
1904+
def test_concat_no_unnecessary_upcats(self):
1905+
# GH 13247
1906+
1907+
for pdt in [pd.Series, pd.DataFrame, pd.Panel, pd.Panel4D]:
1908+
dims = pdt().ndim
1909+
for dt in np.sctypes['float']:
1910+
dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
1911+
pdt(np.array([np.nan], dtype=dt, ndmin=dims)),
1912+
pdt(np.array([5], dtype=dt, ndmin=dims))]
1913+
x = pd.concat(dfs)
1914+
self.assertTrue(x.values.dtype == dt)
1915+
1916+
for dt in(np.sctypes['int'] + np.sctypes['uint']):
1917+
dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
1918+
pdt(np.array([5], dtype=dt ,ndmin=dims))]
1919+
x = pd.concat(dfs)
1920+
self.assertTrue(x.values.dtype == dt)
1921+
1922+
objs = []
1923+
objs.append(pdt(np.array([1], dtype=np.float32, ndmin=dims)))
1924+
objs.append(pdt(np.array([1], dtype=np.float16, ndmin=dims)))
1925+
self.assertTrue(pd.concat(objs).values.dtype == np.float32)
1926+
1927+
objs = []
1928+
objs.append(pdt(np.array([1], dtype=np.int32, ndmin=dims)))
1929+
objs.append(pdt(np.array([1], dtype=np.int64, ndmin=dims)))
1930+
self.assertTrue(pd.concat(objs).values.dtype == np.int64)
1931+
1932+
# int32 + float16: not sure what the best common dtype is here; float64 is what is currently asserted
1933+
objs = []
1934+
objs.append(pdt(np.array([1], dtype=np.int32, ndmin=dims)))
1935+
objs.append(pdt(np.array([1], dtype=np.float16, ndmin=dims)))
1936+
self.assertTrue(pd.concat(objs).values.dtype == np.float64)

0 commit comments

Comments
 (0)