-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: multi-type SparseDataFrame fixes and improvements #13917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
2e833fa
fb6237c
c7fb0f2
114217e
33973a5
93d2de6
6782bc7
2104948
ac790d7
eebcb23
8d675ad
442b8c1
926ca1e
057d56b
8c7d1ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,8 @@ | |
_infer_dtype_from_scalar, | ||
_soft_convert_objects, | ||
_possibly_convert_objects, | ||
_astype_nansafe) | ||
_astype_nansafe, | ||
_find_common_type) | ||
from pandas.types.missing import (isnull, array_equivalent, | ||
_is_na_compat, | ||
is_null_datelike_scalar) | ||
|
@@ -4435,14 +4436,6 @@ def _interleaved_dtype(blocks): | |
for x in blocks: | ||
counts[type(x)].append(x) | ||
|
||
def _lcd_dtype(l): | ||
""" find the lowest dtype that can accomodate the given types """ | ||
m = l[0].dtype | ||
for x in l[1:]: | ||
if x.dtype.itemsize > m.itemsize: | ||
m = x.dtype | ||
return m | ||
|
||
have_int = len(counts[IntBlock]) > 0 | ||
have_bool = len(counts[BoolBlock]) > 0 | ||
have_object = len(counts[ObjectBlock]) > 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what I meant was can start off by simply moving this (your version or new one with |
||
|
@@ -4455,7 +4448,6 @@ def _lcd_dtype(l): | |
# TODO: have_sparse is not used | ||
have_sparse = len(counts[SparseBlock]) > 0 # noqa | ||
have_numeric = have_float or have_complex or have_int | ||
|
||
has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat | ||
|
||
if (have_object or | ||
|
@@ -4467,10 +4459,9 @@ def _lcd_dtype(l): | |
elif have_bool: | ||
return np.dtype(bool) | ||
elif have_int and not have_float and not have_complex: | ||
|
||
# if we are mixing unsigned and signed, then return | ||
# the next biggest int type (if we can) | ||
lcd = _lcd_dtype(counts[IntBlock]) | ||
lcd = _find_common_type([b.dtype for b in counts[IntBlock]]) | ||
kinds = set([i.dtype.kind for i in counts[IntBlock]]) | ||
if len(kinds) == 1: | ||
return lcd | ||
|
@@ -4486,7 +4477,8 @@ def _lcd_dtype(l): | |
elif have_complex: | ||
return np.dtype('c16') | ||
else: | ||
return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock]) | ||
introspection_blks = counts[FloatBlock] + counts[SparseBlock] | ||
return _find_common_type([b.dtype for b in introspection_blks]) | ||
|
||
|
||
def _consolidate(blocks): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -104,15 +104,21 @@ def test_as_matrix_lcd(self): | |
values = self.mixed_float.as_matrix(['C']) | ||
self.assertEqual(values.dtype, np.float16) | ||
|
||
# GH 10364 | ||
# B uint64 forces float because there are other signed int types | ||
values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D']) | ||
self.assertEqual(values.dtype, np.int64) | ||
self.assertEqual(values.dtype, np.float64) | ||
|
||
values = self.mixed_int.as_matrix(['A', 'D']) | ||
self.assertEqual(values.dtype, np.int64) | ||
|
||
# guess all ints are cast to uints.... | ||
# B uint64 forces float because there are other signed int types | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this might fix another bug, can you search for uint64 issues and see? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add the issue as a reference here |
||
values = self.mixed_int.as_matrix(['A', 'B', 'C']) | ||
self.assertEqual(values.dtype, np.int64) | ||
self.assertEqual(values.dtype, np.float64) | ||
|
||
# as B and C are both unsigned, no forcing to float is needed | ||
values = self.mixed_int.as_matrix(['B', 'C']) | ||
self.assertEqual(values.dtype, np.uint64) | ||
|
||
values = self.mixed_int.as_matrix(['A', 'C']) | ||
self.assertEqual(values.dtype, np.int32) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,10 @@ | |
_possibly_convert_objects, | ||
_infer_dtype_from_scalar, | ||
_maybe_convert_string_to_object, | ||
_maybe_convert_scalar) | ||
_maybe_convert_scalar, | ||
_find_common_type) | ||
from pandas.types.dtypes import (CategoricalDtype, | ||
DatetimeTZDtype) | ||
from pandas.util import testing as tm | ||
|
||
_multiprocess_can_split_ = True | ||
|
@@ -188,6 +191,46 @@ def test_possibly_convert_objects_copy(self): | |
self.assertTrue(values is not out) | ||
|
||
|
||
class TestCommonTypes(tm.TestCase): | ||
def test_numpy_dtypes(self): | ||
# (source_types, destination_type) | ||
testcases = ( | ||
# identity | ||
((np.int64,), np.int64), | ||
((np.uint64,), np.uint64), | ||
((np.float32,), np.float32), | ||
((np.object,), np.object), | ||
|
||
# into ints | ||
((np.int16, np.int64), np.int64), | ||
((np.int32, np.uint32), np.int64), | ||
((np.uint16, np.uint64), np.uint64), | ||
|
||
# into floats | ||
((np.float16, np.float32), np.float32), | ||
((np.float16, np.int16), np.float32), | ||
((np.float32, np.int16), np.float32), | ||
((np.uint64, np.int64), np.float64), | ||
((np.int16, np.float64), np.float64), | ||
((np.float16, np.int64), np.float64), | ||
|
||
# into others | ||
((np.complex128, np.int32), np.complex128), | ||
((np.object, np.float32), np.object), | ||
((np.object, np.int16), np.object), | ||
) | ||
for src, common in testcases: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice! |
||
self.assertEqual(_find_common_type(src), common) | ||
|
||
def test_pandas_dtypes(self): | ||
# TODO: not implemented yet | ||
with self.assertRaises(TypeError): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put a TODO: this is not implemented |
||
self.assertEqual(_find_common_type([CategoricalDtype()]), | ||
CategoricalDtype) | ||
with self.assertRaises(TypeError): | ||
self.assertEqual(_find_common_type([DatetimeTZDtype()]), | ||
DatetimeTZDtype) | ||
|
||
if __name__ == '__main__': | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], | ||
exit=False) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
_ensure_int32, _ensure_int64, | ||
_NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, | ||
_DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) | ||
from .dtypes import ExtensionDtype | ||
from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries | ||
from .missing import isnull, notnull | ||
from .inference import is_list_like | ||
|
@@ -861,3 +862,12 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): | |
value = _possibly_infer_to_datetimelike(value) | ||
|
||
return value | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add some tests for validation on this (there might be some existing) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IOW, just run it thru the standard types for now, and asserting that it raises for pandas dtypes |
||
def _find_common_type(types): | ||
"""Find a common data type among the given dtypes.""" | ||
# TODO: enable using pandas-specific types | ||
if any(isinstance(t, ExtensionDtype) for t in types): | ||
raise TypeError("Common type discovery is currently only " | ||
"supported for pure numpy dtypes.") | ||
return np.find_common_type(types, []) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
, types where previously...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This reads fine to me? There's a colon after
SparseDataFrame
sThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
change : to ,
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh wait sorry I misread your comment.