Skip to content

BUG: upcasting on reshaping ops #13247 #15594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -886,3 +886,5 @@ Bug Fixes
- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
- Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)

- Concatenating multiple objects will no longer automatically upcast to ``float64``; instead, the smallest ``dtype`` that suffices is used (:issue:`13247`)
20 changes: 16 additions & 4 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
is_datetime64tz_dtype,
is_object_dtype,
is_datetimelike_v_numeric,
is_float_dtype, is_numeric_dtype,
is_numeric_v_string_like, is_extension_type,
is_list_like,
is_re,
Expand Down Expand Up @@ -4522,6 +4523,8 @@ def _interleaved_dtype(blocks):
return np.dtype('int%s' % (lcd.itemsize * 8 * 2))
return lcd

elif have_int and have_float and not have_complex:
return np.dtype('float64')
elif have_complex:
return np.dtype('c16')
else:
Expand Down Expand Up @@ -4891,6 +4894,8 @@ def get_empty_dtype_and_na(join_units):
upcast_cls = 'datetime'
elif is_timedelta64_dtype(dtype):
upcast_cls = 'timedelta'
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
upcast_cls = dtype.name
else:
upcast_cls = 'float'

Expand All @@ -4915,8 +4920,6 @@ def get_empty_dtype_and_na(join_units):
return np.dtype(np.bool_), None
elif 'category' in upcast_classes:
return np.dtype(np.object_), np.nan
elif 'float' in upcast_classes:
return np.dtype(np.float64), np.nan
elif 'datetimetz' in upcast_classes:
dtype = upcast_classes['datetimetz']
return dtype[0], tslib.iNaT
Expand All @@ -4925,7 +4928,17 @@ def get_empty_dtype_and_na(join_units):
elif 'timedelta' in upcast_classes:
return np.dtype('m8[ns]'), tslib.iNaT
else: # pragma
raise AssertionError("invalid dtype determination in get_concat_dtype")
g = np.find_common_type(upcast_classes, [])
if is_float_dtype(g):
return g, g.type(np.nan)
elif is_numeric_dtype(g):
if has_none_blocks:
return np.float64, np.nan
else:
return g, None
else:
msg = "invalid dtype determination in get_concat_dtype"
raise AssertionError(msg)


def concatenate_join_units(join_units, concat_axis, copy):
Expand Down Expand Up @@ -5190,7 +5203,6 @@ def is_null(self):
return True

def get_reindexed_values(self, empty_dtype, upcasted_na):

if upcasted_na is None:
# No upcasting is necessary
fill_value = self.block.fill_value
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def f():
df.loc[3] = [6, 7]

exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'],
dtype='float64')
dtype='object')
tm.assert_frame_equal(df, exp)

def test_series_partial_set(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ def test_interleave(self):
mgr = create_mgr('a: f8; b: i8')
self.assertEqual(mgr.as_matrix().dtype, 'f8')
mgr = create_mgr('a: f4; b: i8')
self.assertEqual(mgr.as_matrix().dtype, 'f4')
self.assertEqual(mgr.as_matrix().dtype, 'f8')
mgr = create_mgr('a: f4; b: i8; d: object')
self.assertEqual(mgr.as_matrix().dtype, 'object')
mgr = create_mgr('a: bool; b: i8')
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ def test_basic_types(self):
self.assertEqual(type(r), exp_df_type)

r = get_dummies(s_df, sparse=self.sparse, columns=['a'])
exp_blk_type = pd.core.internals.IntBlock
self.assertEqual(type(r[['a_0']]._data.blocks[0]), exp_blk_type)
self.assertEqual(type(r[['a_1']]._data.blocks[0]), exp_blk_type)
self.assertEqual(type(r[['a_2']]._data.blocks[0]), exp_blk_type)
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/tools/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
makeCustomDataframe as mkdf,
assert_almost_equal)

import pytest


class ConcatenateBase(tm.TestCase):

Expand Down Expand Up @@ -1899,3 +1901,15 @@ def test_concat_multiindex_dfs_with_deepcopy(self):
tm.assert_frame_equal(result_copy, expected)
result_no_copy = pd.concat(example_dict, names=['testname'])
tm.assert_frame_equal(result_no_copy, expected)


@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
@pytest.mark.parametrize('dt', np.sctypes['float'])
def test_concat_no_unnecessary_upcast(dt, pdt):
    """Check that concatenating same-dtype float objects keeps that dtype.

    Three single-element objects of container type ``pdt`` are built from
    arrays of dtype ``dt`` (the middle one holds NaN), concatenated with
    ``pd.concat``, and the dtype of the result's values is compared
    against ``dt``.
    """
    # GH 13247
    # An empty container is instantiated only to read its dimensionality,
    # so the input arrays can be given a matching number of dimensions.
    ndim = pdt().ndim
    pieces = [pdt(np.array([value], dtype=dt, ndmin=ndim))
              for value in (1, np.nan, 5)]
    result = pd.concat(pieces)
    assert result.values.dtype == dt