Skip to content

BUG: multi-type SparseDataFrame fixes and improvements #13917

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,7 @@ Note that the limitation is applied to ``fill_value`` which default is ``np.nan`
- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`)
- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`)
- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`)
- Bug when interacting with multi-type SparseDataFrames: single row slicing now works because types are not forced to float (:issue:`13917`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug in single row slicing on multi-dtype ``SparseDataFrame``s ...


.. _whatsnew_0190.deprecations:

Expand Down
15 changes: 3 additions & 12 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4435,14 +4435,6 @@ def _interleaved_dtype(blocks):
for x in blocks:
counts[type(x)].append(x)

def _lcd_dtype(l):
""" find the lowest dtype that can accomodate the given types """
m = l[0].dtype
for x in l[1:]:
if x.dtype.itemsize > m.itemsize:
m = x.dtype
return m

have_int = len(counts[IntBlock]) > 0
have_bool = len(counts[BoolBlock]) > 0
have_object = len(counts[ObjectBlock]) > 0
Expand All @@ -4455,7 +4447,6 @@ def _lcd_dtype(l):
# TODO: have_sparse is not used
have_sparse = len(counts[SparseBlock]) > 0 # noqa
have_numeric = have_float or have_complex or have_int

has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat

if (have_object or
Expand All @@ -4467,10 +4458,9 @@ def _lcd_dtype(l):
elif have_bool:
return np.dtype(bool)
elif have_int and not have_float and not have_complex:

# if we are mixing unsigned and signed, then return
# the next biggest int type (if we can)
lcd = _lcd_dtype(counts[IntBlock])
lcd = np.find_common_type([b.dtype for b in counts[IntBlock]], [])
kinds = set([i.dtype.kind for i in counts[IntBlock]])
if len(kinds) == 1:
return lcd
Expand All @@ -4486,7 +4476,8 @@ def _lcd_dtype(l):
elif have_complex:
return np.dtype('c16')
else:
return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock])
introspection_blks = counts[FloatBlock] + counts[SparseBlock]
return np.find_common_type([b.dtype for b in introspection_blks], [])


def _consolidate(blocks):
Expand Down
2 changes: 1 addition & 1 deletion pandas/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ class SparseArray(PandasObject, np.ndarray):
fill_value = None

def __new__(cls, data, sparse_index=None, index=None, kind='integer',
fill_value=None, dtype=np.float64, copy=False):
fill_value=None, dtype=None, copy=False):

if index is not None:
if data is None:
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,20 @@ def test_as_matrix_lcd(self):
values = self.mixed_float.as_matrix(['C'])
self.assertEqual(values.dtype, np.float16)

# B uint64 forces float because there are other signed int types
values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D'])
self.assertEqual(values.dtype, np.int64)
self.assertEqual(values.dtype, np.float64)

values = self.mixed_int.as_matrix(['A', 'D'])
self.assertEqual(values.dtype, np.int64)

# guess all ints are cast to uints....
# B uint64 forces float because there are other signed int types
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might fix another bug, can you search for uint64 issues and see?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add the issue as a reference here
and in the whatsnew

values = self.mixed_int.as_matrix(['A', 'B', 'C'])
self.assertEqual(values.dtype, np.int64)
self.assertEqual(values.dtype, np.float64)

# as B and C are both unsigned, no forcing to float is needed
values = self.mixed_int.as_matrix(['B', 'C'])
self.assertEqual(values.dtype, np.uint64)

values = self.mixed_int.as_matrix(['A', 'C'])
self.assertEqual(values.dtype, np.int32)
Expand Down
59 changes: 59 additions & 0 deletions pandas/tests/frame/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2713,6 +2713,65 @@ def test_type_error_multiindex(self):
assert_series_equal(result, expected)


class TestSparseDataFrameMultitype(tm.TestCase):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These need to go in pandas/sparse/tests (maybe in a new file inside there if nothing appropriate exists).

def setUp(self):
    """Create one sparse series per dtype and a frame holding all of them.

    The fixture exposes ``string_series``, ``int_series``, ``float_series``
    and ``object_series``, the column order in ``cols`` and the resulting
    ``SparseDataFrame`` in ``sdf`` (columns arranged as listed in ``cols``).
    """
    super(TestSparseDataFrameMultitype, self).setUp()

    # One column per element type: str, int, float and arbitrary objects.
    self.string_series = pd.SparseSeries(['a', 'b', 'c'])
    self.int_series = pd.SparseSeries([1, 2, 3])
    self.float_series = pd.SparseSeries([1.1, 1.2, 1.3])
    self.object_series = pd.SparseSeries([[], {}, set()])

    self.cols = ['string', 'int', 'float', 'object']
    column_data = dict(zip(self.cols, [
        self.string_series,
        self.int_series,
        self.float_series,
        self.object_series,
    ]))

    # Select by ``cols`` so the column order is explicit rather than
    # whatever order the dict-based constructor produced.
    frame = pd.SparseDataFrame(column_data)
    self.sdf = frame[self.cols]

def test_basic_dtypes(self):
    """Rows come back as object dtype; columns equal their source series."""
    # A row spans str/int/float/object columns, so its dtype must be object.
    for _label, row_values in self.sdf.iterrows():
        self.assertEqual(row_values.dtype, object)

    # Each column must round-trip unchanged (names are not compared).
    column_sources = [
        ('string', self.string_series),
        ('int', self.int_series),
        ('float', self.float_series),
        ('object', self.object_series),
    ]
    for column, source in column_sources:
        tm.assert_sp_series_equal(self.sdf[column], source,
                                  check_names=False)

def test_indexing_single(self):
    """``iloc`` with a single position returns the row as a SparseSeries."""
    rows_by_position = [
        ['a', 1, 1.1, []],
        ['b', 2, 1.2, {}],
        ['c', 3, 1.3, set()],
    ]
    for position, row_values in enumerate(rows_by_position):
        actual = self.sdf.iloc[position]
        wanted = pd.SparseSeries(row_values, index=self.cols)
        tm.assert_sp_series_equal(actual, wanted, check_names=False)

def test_indexing_multiple(self):
    """Full slices, positional row lists and column lists keep the data."""
    # Full slices must reproduce the frame unchanged.
    tm.assert_sp_frame_equal(self.sdf, self.sdf[:])
    tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:])

    # Selecting rows 1 and 2 by position.
    last_two_rows = self.sdf.iloc[[1, 2]]
    wanted_rows = pd.SparseDataFrame({
        'string': ['b', 'c'],
        'int': [2, 3],
        'float': [1.2, 1.3],
        'object': [{}, set()]
    }, index=[1, 2])[self.cols]
    tm.assert_sp_frame_equal(last_two_rows, wanted_rows)

    # Selecting a subset of columns.
    int_and_string = self.sdf[['int', 'string']]
    wanted_columns = pd.SparseDataFrame({
        'int': self.int_series,
        'string': self.string_series,
    })
    tm.assert_sp_frame_equal(int_and_string, wanted_columns)


class TestDataFrameIndexingDatetimeWithTZ(tm.TestCase, TestData):

_multiprocess_can_split_ = True
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/series/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1856,3 +1856,29 @@ def test_multilevel_preserve_name(self):
result2 = s.ix['foo']
self.assertEqual(result.name, s.name)
self.assertEqual(result2.name, s.name)


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

class TestSparseSeriesMultitype(tm.TestCase):
    """Indexing checks for a SparseSeries whose values have mixed types."""

    def setUp(self):
        """Build a four-element series holding a str, int, float and list."""
        super(TestSparseSeriesMultitype, self).setUp()
        self.index = ['string', 'int', 'float', 'object']
        self.ss = pd.SparseSeries(['a', 1, 1.1, []],
                                  index=self.index)

    def test_indexing_single(self):
        """Positional and label lookups agree in both value and type."""
        for position, label in enumerate(self.index):
            by_position = self.ss.iloc[position]
            by_label = self.ss[label]
            self.assertEqual(by_position, by_label)
            self.assertEqual(type(by_position), type(by_label))

        # The stored scalars come back as originally given.
        wanted_by_label = [
            ('string', 'a'),
            ('int', 1),
            ('float', 1.1),
            ('object', []),
        ]
        for label, wanted in wanted_by_label:
            self.assertEqual(self.ss[label], wanted)

    def test_indexing_multiple(self):
        """``loc`` with a list of labels returns the matching sub-series."""
        selections = [
            (['string', 'int'], ['a', 1]),
            (['string', 'object'], ['a', []]),
        ]
        for labels, values in selections:
            actual = self.ss.loc[labels]
            wanted = pd.SparseSeries(values, index=labels)
            tm.assert_sp_series_equal(actual, wanted)