-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: upcasting on reshaping ops #13247 #13337
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,7 +43,7 @@ Backwards incompatible API changes | |
.. _whatsnew_0190.api: | ||
|
||
|
||
|
||
- Concating multiple objects will no longer result in automatically upcast to `float64`, and instead try to find the smallest `dtype` that would suffice (:issue:`13247`) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. move to 0.20.0 |
||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
array_equivalent, _is_na_compat, | ||
_maybe_convert_string_to_object, | ||
_maybe_convert_scalar, | ||
is_float_dtype, is_numeric_dtype, | ||
is_categorical, is_datetimelike_v_numeric, | ||
is_numeric_v_string_like, is_extension_type) | ||
import pandas.core.algorithms as algos | ||
|
@@ -4443,6 +4444,8 @@ def _lcd_dtype(l): | |
return np.dtype('int%s' % (lcd.itemsize * 8 * 2)) | ||
return lcd | ||
|
||
elif have_int and have_float and not have_complex: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. side issue: may want to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
return np.dtype('float64') | ||
elif have_complex: | ||
return np.dtype('c16') | ||
else: | ||
|
@@ -4785,6 +4788,8 @@ def get_empty_dtype_and_na(join_units): | |
upcast_cls = 'datetime' | ||
elif is_timedelta64_dtype(dtype): | ||
upcast_cls = 'timedelta' | ||
elif is_float_dtype(dtype) or is_numeric_dtype(dtype): | ||
upcast_cls = dtype.name | ||
else: | ||
upcast_cls = 'float' | ||
|
||
|
@@ -4809,8 +4814,6 @@ def get_empty_dtype_and_na(join_units): | |
return np.dtype(np.bool_), None | ||
elif 'category' in upcast_classes: | ||
return np.dtype(np.object_), np.nan | ||
elif 'float' in upcast_classes: | ||
return np.dtype(np.float64), np.nan | ||
elif 'datetimetz' in upcast_classes: | ||
dtype = upcast_classes['datetimetz'] | ||
return dtype[0], tslib.iNaT | ||
|
@@ -4819,7 +4822,17 @@ def get_empty_dtype_and_na(join_units): | |
elif 'timedelta' in upcast_classes: | ||
return np.dtype('m8[ns]'), tslib.iNaT | ||
else: # pragma | ||
raise AssertionError("invalid dtype determination in get_concat_dtype") | ||
g = np.find_common_type(upcast_classes, []) | ||
if is_float_dtype(g): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ahh you are doing this here; so similar to above in lcd dtypes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use pandas.types._find_common_type |
||
return g, g.type(np.nan) | ||
elif is_numeric_dtype(g): | ||
if has_none_blocks: | ||
return np.float64, np.nan | ||
else: | ||
return g, None | ||
else: | ||
msg = "invalid dtype determination in get_concat_dtype" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to stick to the exact error message as before (to minimize my change set). Here is the code coverage: https://codecov.io/gh/pydata/pandas/compare/2e3c82e81bf00af157268a842a270a6181fcb168...7ac5a112f3d210290e1a08fea1ad25bee0134816 |
||
raise AssertionError(msg) | ||
|
||
|
||
def concatenate_join_units(join_units, concat_axis, copy): | ||
|
@@ -5083,7 +5096,6 @@ def is_null(self): | |
return True | ||
|
||
def get_reindexed_values(self, empty_dtype, upcasted_na): | ||
|
||
if upcasted_na is None: | ||
# No upcasting is necessary | ||
fill_value = self.block.fill_value | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4035,11 +4035,11 @@ def f(): | |
|
||
self.assertRaises(ValueError, f) | ||
|
||
# these are coerced to float unavoidably (as its a list-like to begin) | ||
# these are coerced to object unavoidably (as its a list-like to begin) | ||
df = DataFrame(columns=['A', 'B']) | ||
df.loc[3] = [6, 7] | ||
assert_frame_equal(df, DataFrame( | ||
[[6, 7]], index=[3], columns=['A', 'B'], dtype='float64')) | ||
[[6, 7]], index=[3], columns=['A', 'B'], dtype='object')) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this we don't want to change though. This is a bit tricky because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please elaborate some more? I am not sure what you mean, and why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ping @jreback . There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
def test_partial_setting_with_datetimelike_dtype(self): | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -655,7 +655,7 @@ def test_interleave(self): | |
mgr = create_mgr('a: f8; b: i8') | ||
self.assertEqual(mgr.as_matrix().dtype, 'f8') | ||
mgr = create_mgr('a: f4; b: i8') | ||
self.assertEqual(mgr.as_matrix().dtype, 'f4') | ||
self.assertEqual(mgr.as_matrix().dtype, 'f8') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ideally this would be f4 still There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can take another look at this. It can go both ways. There is loss of precision of big integers when going from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh sorry, I thought this was ok, yeah I think if you have integers then you should just force the upcast (for i8), yeah I suppose you can get more creative esp if you have i2 or something, but prob not worth the effort. I would do what numpy does for these numeric castings (e.g. use |
||
mgr = create_mgr('a: f4; b: i8; d: object') | ||
self.assertEqual(mgr.as_matrix().dtype, 'object') | ||
mgr = create_mgr('a: bool; b: i8') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1031,6 +1031,40 @@ def test_concat_invalid_first_argument(self): | |
expected = read_csv(StringIO(data)) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_concat_no_unnecessary_upcasts(self): | ||
# fixes #13247 | ||
|
||
for pdt in [pd.Series, pd.DataFrame, pd.Panel, pd.Panel4D]: | ||
dims = pdt().ndim | ||
for dt in np.sctypes['float']: | ||
dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), | ||
pdt(np.array([np.nan], dtype=dt, ndmin=dims)), | ||
pdt(np.array([5], dtype=dt, ndmin=dims))] | ||
x = pd.concat(dfs) | ||
self.assertTrue(x.values.dtype == dt) | ||
|
||
for dt in (np.sctypes['int'] + np.sctypes['uint']): | ||
dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), | ||
pdt(np.array([5], dtype=dt, ndmin=dims))] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add some tests that have mixed dtypes in the input as well (e.g. mixed float32/64 and int types) |
||
x = pd.concat(dfs) | ||
self.assertTrue(x.values.dtype == dt) | ||
|
||
objs = [] | ||
objs.append(pdt(np.array([1], dtype=np.float32, ndmin=dims))) | ||
objs.append(pdt(np.array([1], dtype=np.float16, ndmin=dims))) | ||
self.assertTrue(pd.concat(objs).values.dtype == np.float32) | ||
|
||
objs = [] | ||
objs.append(pdt(np.array([1], dtype=np.int32, ndmin=dims))) | ||
objs.append(pdt(np.array([1], dtype=np.int64, ndmin=dims))) | ||
self.assertTrue(pd.concat(objs).values.dtype == np.int64) | ||
|
||
# not sure what is the best answer here | ||
objs = [] | ||
objs.append(pdt(np.array([1], dtype=np.int32, ndmin=dims))) | ||
objs.append(pdt(np.array([1], dtype=np.float16, ndmin=dims))) | ||
self.assertTrue(pd.concat(objs).values.dtype == np.float64) | ||
|
||
|
||
if __name__ == '__main__': | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
did you put this in 0.19.0 on purpose? I think this is prob ok for 0.18.2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, because was tagged with "Next major release", so I thought it was supposed to go in 0.19. #13247. I can easily move back.