-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: multi-type SparseDataFrame fixes and improvements #13917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
2e833fa
fb6237c
c7fb0f2
114217e
33973a5
93d2de6
6782bc7
2104948
ac790d7
eebcb23
8d675ad
442b8c1
926ca1e
057d56b
8c7d1ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -437,6 +437,7 @@ API changes | |
- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) | ||
- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) | ||
- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) | ||
- ``.values`` will now return ``np.float64`` with a ``DataFrame`` with ``np.int64`` and ``np.uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) | ||
|
||
|
||
|
||
|
@@ -764,6 +765,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan` | |
- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) | ||
- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) | ||
- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) | ||
- Bug in single row slicing on multi-dtype ``SparseDataFrame``: rows are no longer forced to float dtype (:issue:`13917`) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug in single row slicing on multi-dtype |
||
|
||
.. _whatsnew_0190.deprecations: | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,8 @@ | |
_infer_dtype_from_scalar, | ||
_soft_convert_objects, | ||
_possibly_convert_objects, | ||
_astype_nansafe) | ||
_astype_nansafe, | ||
_find_common_type) | ||
from pandas.types.missing import (isnull, array_equivalent, | ||
_is_na_compat, | ||
is_null_datelike_scalar) | ||
|
@@ -4435,14 +4436,6 @@ def _interleaved_dtype(blocks): | |
for x in blocks: | ||
counts[type(x)].append(x) | ||
|
||
def _lcd_dtype(l): | ||
""" find the lowest dtype that can accommodate the given types """ | ||
m = l[0].dtype | ||
for x in l[1:]: | ||
if x.dtype.itemsize > m.itemsize: | ||
m = x.dtype | ||
return m | ||
|
||
have_int = len(counts[IntBlock]) > 0 | ||
have_bool = len(counts[BoolBlock]) > 0 | ||
have_object = len(counts[ObjectBlock]) > 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what I meant was can start off by simply moving this (your version or new one with |
||
|
@@ -4455,7 +4448,6 @@ def _lcd_dtype(l): | |
# TODO: have_sparse is not used | ||
have_sparse = len(counts[SparseBlock]) > 0 # noqa | ||
have_numeric = have_float or have_complex or have_int | ||
|
||
has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat | ||
|
||
if (have_object or | ||
|
@@ -4467,10 +4459,9 @@ def _lcd_dtype(l): | |
elif have_bool: | ||
return np.dtype(bool) | ||
elif have_int and not have_float and not have_complex: | ||
|
||
# if we are mixing unsigned and signed, then return | ||
# the next biggest int type (if we can) | ||
lcd = _lcd_dtype(counts[IntBlock]) | ||
lcd = _find_common_type([b.dtype for b in counts[IntBlock]]) | ||
kinds = set([i.dtype.kind for i in counts[IntBlock]]) | ||
if len(kinds) == 1: | ||
return lcd | ||
|
@@ -4486,7 +4477,8 @@ def _lcd_dtype(l): | |
elif have_complex: | ||
return np.dtype('c16') | ||
else: | ||
return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock]) | ||
introspection_blks = counts[FloatBlock] + counts[SparseBlock] | ||
return _find_common_type([b.dtype for b in introspection_blks]) | ||
|
||
|
||
def _consolidate(blocks): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pandas.util.testing as tm | ||
|
||
|
||
class TestSparseDataFrameMultitype(tm.TestCase): | ||
def setUp(self): | ||
super(TestSparseDataFrameMultitype, self).setUp() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you don't need the super |
||
self.string_series = pd.SparseSeries(['a', 'b', 'c']) | ||
self.int_series = pd.SparseSeries([1, 2, 3]) | ||
self.float_series = pd.SparseSeries([1.1, 1.2, 1.3]) | ||
self.object_series = pd.SparseSeries([[], {}, set()]) | ||
self.sdf = pd.SparseDataFrame({ | ||
'string': self.string_series, | ||
'int': self.int_series, | ||
'float': self.float_series, | ||
'object': self.object_series, | ||
}) | ||
self.cols = ['string', 'int', 'float', 'object'] | ||
self.sdf = self.sdf[self.cols] | ||
|
||
def test_basic_dtypes(self): | ||
for _, row in self.sdf.iterrows(): | ||
self.assertEqual(row.dtype, object) | ||
tm.assert_sp_series_equal(self.sdf['string'], self.string_series, | ||
check_names=False) | ||
tm.assert_sp_series_equal(self.sdf['int'], self.int_series, | ||
check_names=False) | ||
tm.assert_sp_series_equal(self.sdf['float'], self.float_series, | ||
check_names=False) | ||
tm.assert_sp_series_equal(self.sdf['object'], self.object_series, | ||
check_names=False) | ||
|
||
def test_indexing_single(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we have a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make a TestMultitype class in indexing.py instead of having a separate file? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes. |
||
tm.assert_sp_series_equal(self.sdf.iloc[0], | ||
pd.SparseSeries(['a', 1, 1.1, []], | ||
index=self.cols), | ||
check_names=False) | ||
tm.assert_sp_series_equal(self.sdf.iloc[1], | ||
pd.SparseSeries(['b', 2, 1.2, {}], | ||
index=self.cols), | ||
check_names=False) | ||
tm.assert_sp_series_equal(self.sdf.iloc[2], | ||
pd.SparseSeries(['c', 3, 1.3, set()], | ||
index=self.cols), | ||
check_names=False) | ||
|
||
def test_indexing_multiple(self): | ||
tm.assert_sp_frame_equal(self.sdf, self.sdf[:]) | ||
tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:]) | ||
tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]], | ||
pd.SparseDataFrame({ | ||
'string': self.string_series.iloc[[1, 2]], | ||
'int': self.int_series.iloc[[1, 2]], | ||
'float': self.float_series.iloc[[1, 2]], | ||
'object': self.object_series.iloc[[1, 2]] | ||
}, index=[1, 2])[self.cols]) | ||
tm.assert_sp_frame_equal(self.sdf[['int', 'string']], | ||
pd.SparseDataFrame({ | ||
'int': self.int_series, | ||
'string': self.string_series, | ||
})) | ||
|
||
|
||
class TestSparseSeriesMultitype(tm.TestCase): | ||
def setUp(self): | ||
super(TestSparseSeriesMultitype, self).setUp() | ||
self.index = ['string', 'int', 'float', 'object'] | ||
self.ss = pd.SparseSeries(['a', 1, 1.1, []], | ||
index=self.index) | ||
|
||
def test_indexing_single(self): | ||
for i, idx in enumerate(self.index): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here |
||
self.assertEqual(self.ss.iloc[i], self.ss[idx]) | ||
self.assertEqual(type(self.ss.iloc[i]), | ||
type(self.ss[idx])) | ||
self.assertEqual(self.ss['string'], 'a') | ||
self.assertEqual(self.ss['int'], 1) | ||
self.assertEqual(self.ss['float'], 1.1) | ||
self.assertEqual(self.ss['object'], []) | ||
|
||
def test_indexing_multiple(self): | ||
tm.assert_sp_series_equal(self.ss.loc[['string', 'int']], | ||
pd.SparseSeries(['a', 1], | ||
index=['string', 'int'])) | ||
tm.assert_sp_series_equal(self.ss.loc[['string', 'object']], | ||
pd.SparseSeries(['a', []], | ||
index=['string', 'object'])) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -104,15 +104,21 @@ def test_as_matrix_lcd(self): | |
values = self.mixed_float.as_matrix(['C']) | ||
self.assertEqual(values.dtype, np.float16) | ||
|
||
# GH 10364 | ||
# B uint64 forces float because there are other signed int types | ||
values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D']) | ||
self.assertEqual(values.dtype, np.int64) | ||
self.assertEqual(values.dtype, np.float64) | ||
|
||
values = self.mixed_int.as_matrix(['A', 'D']) | ||
self.assertEqual(values.dtype, np.int64) | ||
|
||
# guess all ints are cast to uints.... | ||
# B uint64 forces float because there are other signed int types | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this might fix another bug, can you search for uint64 issues and see? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add the issue as a reference here |
||
values = self.mixed_int.as_matrix(['A', 'B', 'C']) | ||
self.assertEqual(values.dtype, np.int64) | ||
self.assertEqual(values.dtype, np.float64) | ||
|
||
# as B and C are both unsigned, no forcing to float is needed | ||
values = self.mixed_int.as_matrix(['B', 'C']) | ||
self.assertEqual(values.dtype, np.uint64) | ||
|
||
values = self.mixed_int.as_matrix(['A', 'C']) | ||
self.assertEqual(values.dtype, np.int32) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,10 @@ | |
_possibly_convert_objects, | ||
_infer_dtype_from_scalar, | ||
_maybe_convert_string_to_object, | ||
_maybe_convert_scalar) | ||
_maybe_convert_scalar, | ||
_find_common_type) | ||
from pandas.types.dtypes import (CategoricalDtype, | ||
DatetimeTZDtype) | ||
from pandas.util import testing as tm | ||
|
||
_multiprocess_can_split_ = True | ||
|
@@ -188,6 +191,52 @@ def test_possibly_convert_objects_copy(self): | |
self.assertTrue(values is not out) | ||
|
||
|
||
class TestCommonTypes(tm.TestCase): | ||
def test_numpy_dtypes(self): | ||
# identity | ||
self.assertEqual(_find_common_type([np.int64]), np.int64) | ||
self.assertEqual(_find_common_type([np.uint64]), np.uint64) | ||
self.assertEqual(_find_common_type([np.float32]), np.float32) | ||
self.assertEqual(_find_common_type([np.object]), np.object) | ||
|
||
# into ints | ||
self.assertEqual(_find_common_type([np.int16, np.int64]), | ||
np.int64) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. group them eg ints floats easier to read |
||
self.assertEqual(_find_common_type([np.int32, np.uint32]), | ||
np.int64) | ||
self.assertEqual(_find_common_type([np.uint16, np.uint64]), | ||
np.uint64) | ||
|
||
# into floats | ||
self.assertEqual(_find_common_type([np.float16, np.float32]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. make this shorter e.g.
and similarly as much as possible |
||
np.float32) | ||
self.assertEqual(_find_common_type([np.float16, np.int16]), | ||
np.float32) | ||
self.assertEqual(_find_common_type([np.float32, np.int16]), | ||
np.float32) | ||
self.assertEqual(_find_common_type([np.uint64, np.int64]), | ||
np.float64) | ||
self.assertEqual(_find_common_type([np.int16, np.float64]), | ||
np.float64) | ||
self.assertEqual(_find_common_type([np.float16, np.int64]), | ||
np.float64) | ||
|
||
# into others | ||
self.assertEqual(_find_common_type([np.complex128, np.int32]), | ||
np.complex128) | ||
self.assertEqual(_find_common_type([np.object, np.float32]), | ||
np.object) | ||
self.assertEqual(_find_common_type([np.object, np.int16]), | ||
np.object) | ||
|
||
def test_pandas_dtypes(self): | ||
with self.assertRaises(TypeError): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put a TODO: this is not implemented |
||
self.assertEqual(_find_common_type([CategoricalDtype()]), | ||
CategoricalDtype) | ||
with self.assertRaises(TypeError): | ||
self.assertEqual(_find_common_type([DatetimeTZDtype()]), | ||
DatetimeTZDtype) | ||
|
||
if __name__ == '__main__': | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], | ||
exit=False) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
_ensure_int32, _ensure_int64, | ||
_NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, | ||
_DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) | ||
from .dtypes import ExtensionDtype | ||
from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries | ||
from .missing import isnull, notnull | ||
from .inference import is_list_like | ||
|
@@ -861,3 +862,12 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): | |
value = _possibly_infer_to_datetimelike(value) | ||
|
||
return value | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add some tests for validation on this (there might be some existing) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IOW, just run it thru the standard types for now, and asserting that it raises for pandas dtypes |
||
def _find_common_type(types): | ||
"""Find a common data type among the given dtypes.""" | ||
# TODO: enable using pandas-specific types | ||
if any(isinstance(t, ExtensionDtype) for t in types): | ||
raise TypeError("Common type discovery is currently only " | ||
"supported for pure numpy dtypes.") | ||
return np.find_common_type(types, []) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
``DataFrame.values`` will now .... with a frame of mixed ``int64`` and ``uint64`` dtypes ....