Skip to content

Commit 1c669ad

Browse files
committed
ENH: sparse astype now supports int64 and bool
1 parent 45d54d0 commit 1c669ad

File tree

7 files changed

+189
-31
lines changed

7 files changed

+189
-31
lines changed

doc/source/whatsnew/v0.19.0.txt

+18-1
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,24 @@ These changes allow pandas to handle sparse data with more dtypes, and for work
323323

324324
s + 1
325325

326+
- Sparse data structures now support ``astype`` to convert their internal ``dtype`` (:issue:`13900`)
326327

328+
.. ipython:: python
329+
330+
s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0)
331+
s
332+
s.astype(np.int64)
333+
334+
``astype`` fails if the data contains values which cannot be converted to the specified ``dtype``.
335+
Note that this limitation also applies to ``fill_value``, whose default is ``np.nan``.
336+
337+
.. code-block:: ipython
338+
339+
In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64)
340+
Out[7]:
341+
ValueError: unable to coerce current fill_value nan to int64 dtype
342+
343+
- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`)
327344
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
328345
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
329346
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
@@ -411,7 +428,7 @@ API changes
411428
- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`)
412429
- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`)
413430
- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`)
414-
- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`)
431+
415432

416433

417434

pandas/core/internals.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -2504,6 +2504,14 @@ def sp_index(self):
25042504
def kind(self):
25052505
return self.values.kind
25062506

2507+
def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
2508+
klass=None, mgr=None, **kwargs):
2509+
if values is None:
2510+
values = self.values
2511+
values = values.astype(dtype, copy=copy)
2512+
return self.make_block_same_class(values=values,
2513+
placement=self.mgr_locs)
2514+
25072515
def __len__(self):
25082516
try:
25092517
return self.sp_index.length
@@ -2521,7 +2529,7 @@ def make_block_same_class(self, values, placement, sparse_index=None,
25212529
copy=False, fastpath=True, **kwargs):
25222530
""" return a new block """
25232531
if dtype is None:
2524-
dtype = self.dtype
2532+
dtype = values.dtype
25252533
if fill_value is None and not isinstance(values, SparseArray):
25262534
fill_value = self.values.fill_value
25272535

pandas/sparse/array.py

+31-17
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@
1818
from pandas.types.common import (is_float, is_integer,
1919
is_integer_dtype, _ensure_platform_int,
2020
is_list_like,
21-
is_scalar)
22-
from pandas.types.cast import _possibly_convert_platform
21+
is_scalar, is_dtype_equal)
22+
from pandas.types.cast import (_possibly_convert_platform, _maybe_promote,
23+
_astype_nansafe)
2324
from pandas.types.missing import isnull, notnull
2425

2526
from pandas._sparse import SparseIndex, BlockIndex, IntIndex
@@ -236,7 +237,7 @@ def _simple_new(cls, data, sp_index, fill_value):
236237
raise ValueError('sp_index must be a SparseIndex')
237238

238239
result.sp_index = sp_index
239-
result.fill_value = fill_value
240+
result._fill_value = fill_value
240241
return result
241242

242243
@property
@@ -285,7 +286,7 @@ def __array_finalize__(self, obj):
285286
to pass on the index.
286287
"""
287288
self.sp_index = getattr(obj, 'sp_index', None)
288-
self.fill_value = getattr(obj, 'fill_value', None)
289+
self._fill_value = getattr(obj, 'fill_value', None)
289290

290291
def __reduce__(self):
291292
"""Necessary for making this object picklable"""
@@ -301,7 +302,7 @@ def __setstate__(self, state):
301302

302303
fill_value, sp_index = own_state[:2]
303304
self.sp_index = sp_index
304-
self.fill_value = fill_value
305+
self._fill_value = fill_value
305306

306307
def __len__(self):
307308
try:
@@ -344,6 +345,22 @@ def sp_values(self):
344345
# caching not an option, leaks memory
345346
return self.view(np.ndarray)
346347

348+
@property
349+
def fill_value(self):
350+
return self._fill_value
351+
352+
@fill_value.setter
353+
def fill_value(self, value):
354+
if not is_scalar(value):
355+
raise ValueError('fill_value must be a scalar')
356+
# if the specified value triggers type promotion, raise ValueError
357+
new_dtype, fill_value = _maybe_promote(self.dtype, value)
358+
if is_dtype_equal(self.dtype, new_dtype):
359+
self._fill_value = fill_value
360+
else:
361+
msg = 'unable to set fill_value {0} to {1} dtype'
362+
raise ValueError(msg.format(value, self.dtype))
363+
347364
def get_values(self, fill=None):
348365
""" return a dense representation """
349366
return self.to_dense(fill=fill)
@@ -479,19 +496,16 @@ def __setslice__(self, i, j, value):
479496
raise TypeError("SparseArray does not support item assignment via "
480497
"slices")
481498

482-
def astype(self, dtype=None):
483-
"""
484-
485-
"""
499+
def astype(self, dtype=None, copy=True):
486500
dtype = np.dtype(dtype)
487-
if dtype is not None and dtype not in (np.float_, float):
488-
raise TypeError('Can only support floating point data for now')
489-
490-
if self.dtype == dtype:
491-
return self.copy()
492-
else:
493-
return self._simple_new(self.sp_values.astype(dtype),
494-
self.sp_index, float(self.fill_value))
501+
sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy)
502+
try:
503+
fill_value = dtype.type(self.fill_value)
504+
except ValueError:
505+
msg = 'unable to coerce current fill_value {0} to {1} dtype'
506+
raise ValueError(msg.format(self.fill_value, dtype))
507+
return self._simple_new(sp_values, self.sp_index,
508+
fill_value=fill_value)
495509

496510
def copy(self, deep=True):
497511
"""

pandas/sparse/frame.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,19 @@ def to_dense(self):
235235
data = dict((k, v.to_dense()) for k, v in compat.iteritems(self))
236236
return DataFrame(data, index=self.index, columns=self.columns)
237237

238+
def _apply_columns(self, func):
239+
""" get new SparseDataFrame applying func to each columns """
240+
241+
new_data = {}
242+
for col, series in compat.iteritems(self):
243+
new_data[col] = func(series)
244+
245+
return self._constructor(
246+
data=new_data, index=self.index, columns=self.columns,
247+
default_fill_value=self.default_fill_value).__finalize__(self)
248+
238249
def astype(self, dtype):
239-
raise NotImplementedError
250+
return self._apply_columns(lambda x: x.astype(dtype))
240251

241252
def copy(self, deep=True):
242253
"""
@@ -499,13 +510,7 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None):
499510
default_fill_value=self.default_fill_value).__finalize__(self)
500511

501512
def _combine_const(self, other, func):
502-
new_data = {}
503-
for col, series in compat.iteritems(self):
504-
new_data[col] = func(series, other)
505-
506-
return self._constructor(
507-
data=new_data, index=self.index, columns=self.columns,
508-
default_fill_value=self.default_fill_value).__finalize__(self)
513+
return self._apply_columns(lambda x: func(x, other))
509514

510515
def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
511516
limit=None, takeable=False):

pandas/sparse/tests/test_array.py

+62-1
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,68 @@ def test_astype(self):
324324
res.sp_values[:3] = 27
325325
self.assertFalse((self.arr.sp_values[:3] == 27).any())
326326

327-
assertRaisesRegexp(TypeError, "floating point", self.arr.astype, 'i8')
327+
msg = "unable to coerce current fill_value nan to int64 dtype"
328+
with tm.assertRaisesRegexp(ValueError, msg):
329+
self.arr.astype('i8')
330+
331+
arr = SparseArray([0, np.nan, 0, 1])
332+
with tm.assertRaisesRegexp(ValueError, msg):
333+
arr.astype('i8')
334+
335+
arr = SparseArray([0, np.nan, 0, 1], fill_value=0)
336+
msg = "Cannot convert NA to integer"
337+
with tm.assertRaisesRegexp(ValueError, msg):
338+
arr.astype('i8')
339+
340+
def test_astype_all(self):
341+
vals = np.array([1, 2, 3])
342+
arr = SparseArray(vals, fill_value=1)
343+
344+
types = [np.float64, np.float32, np.int64,
345+
np.int32, np.int16, np.int8]
346+
for typ in types:
347+
res = arr.astype(typ)
348+
self.assertEqual(res.dtype, typ)
349+
self.assertEqual(res.sp_values.dtype, typ)
350+
351+
tm.assert_numpy_array_equal(res.values, vals.astype(typ))
352+
353+
def test_set_fill_value(self):
354+
arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
355+
arr.fill_value = 2
356+
self.assertEqual(arr.fill_value, 2)
357+
358+
arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
359+
arr.fill_value = 2
360+
self.assertEqual(arr.fill_value, 2)
361+
362+
# coerces to int
363+
msg = "unable to set fill_value 3\\.1 to int64 dtype"
364+
with tm.assertRaisesRegexp(ValueError, msg):
365+
arr.fill_value = 3.1
366+
367+
msg = "unable to set fill_value nan to int64 dtype"
368+
with tm.assertRaisesRegexp(ValueError, msg):
369+
arr.fill_value = np.nan
370+
371+
arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool)
372+
arr.fill_value = True
373+
self.assertTrue(arr.fill_value)
374+
375+
# coerces to bool
376+
msg = "unable to set fill_value 0 to bool dtype"
377+
with tm.assertRaisesRegexp(ValueError, msg):
378+
arr.fill_value = 0
379+
380+
msg = "unable to set fill_value nan to bool dtype"
381+
with tm.assertRaisesRegexp(ValueError, msg):
382+
arr.fill_value = np.nan
383+
384+
# invalid
385+
msg = "fill_value must be a scalar"
386+
for val in [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]:
387+
with tm.assertRaisesRegexp(ValueError, msg):
388+
arr.fill_value = val
328389

329390
def test_copy_shallow(self):
330391
arr2 = self.arr.copy(deep=False)

pandas/sparse/tests/test_frame.py

+54-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import pandas.sparse.frame as spf
1616

1717
from pandas._sparse import BlockIndex, IntIndex
18-
from pandas.sparse.api import SparseSeries, SparseDataFrame
18+
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray
1919
from pandas.tests.frame.test_misc_api import SharedWithSparse
2020

2121

@@ -588,7 +588,59 @@ def test_applymap(self):
588588
tm.assertIsInstance(result, SparseDataFrame)
589589

590590
def test_astype(self):
591-
self.assertRaises(Exception, self.frame.astype, np.int64)
591+
sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4],
592+
dtype=np.int64),
593+
'B': SparseArray([4, 5, 6, 7],
594+
dtype=np.int64)})
595+
self.assertEqual(sparse['A'].dtype, np.int64)
596+
self.assertEqual(sparse['B'].dtype, np.int64)
597+
598+
res = sparse.astype(np.float64)
599+
exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.]),
600+
'B': SparseArray([4., 5., 6., 7.])},
601+
default_fill_value=np.nan)
602+
tm.assert_sp_frame_equal(res, exp)
603+
self.assertEqual(res['A'].dtype, np.float64)
604+
self.assertEqual(res['B'].dtype, np.float64)
605+
606+
sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
607+
dtype=np.int64),
608+
'B': SparseArray([0, 5, 0, 7],
609+
dtype=np.int64)},
610+
default_fill_value=0)
611+
self.assertEqual(sparse['A'].dtype, np.int64)
612+
self.assertEqual(sparse['B'].dtype, np.int64)
613+
614+
res = sparse.astype(np.float64)
615+
exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.]),
616+
'B': SparseArray([0., 5., 0., 7.])},
617+
default_fill_value=0.)
618+
tm.assert_sp_frame_equal(res, exp)
619+
self.assertEqual(res['A'].dtype, np.float64)
620+
self.assertEqual(res['B'].dtype, np.float64)
621+
622+
def test_astype_bool(self):
623+
sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
624+
fill_value=0,
625+
dtype=np.int64),
626+
'B': SparseArray([0, 5, 0, 7],
627+
fill_value=0,
628+
dtype=np.int64)},
629+
default_fill_value=0)
630+
self.assertEqual(sparse['A'].dtype, np.int64)
631+
self.assertEqual(sparse['B'].dtype, np.int64)
632+
633+
res = sparse.astype(bool)
634+
exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True],
635+
dtype=np.bool,
636+
fill_value=False),
637+
'B': SparseArray([False, True, False, True],
638+
dtype=np.bool,
639+
fill_value=False)},
640+
default_fill_value=False)
641+
tm.assert_sp_frame_equal(res, exp)
642+
self.assertEqual(res['A'].dtype, np.bool)
643+
self.assertEqual(res['B'].dtype, np.bool)
592644

593645
def test_fillna(self):
594646
df = self.zframe.reindex(lrange(5))

pandas/sparse/tests/test_series.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -797,7 +797,8 @@ def test_fill_value_corner(self):
797797
cop2 = self.zbseries.copy()
798798
cop2.fill_value = 1
799799
result = cop2 / cop
800-
self.assertEqual(result.fill_value, np.inf)
800+
# 1 / 0 is inf
801+
self.assertTrue(np.isinf(result.fill_value))
801802

802803
def test_fill_value_when_combine_const(self):
803804
# GH12723

0 commit comments

Comments
 (0)