Skip to content

BUG: astype fill_value for SparseArray.astype #23547

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Nov 12, 2018
101 changes: 88 additions & 13 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,83 @@ def is_dtype(cls, dtype):
return True
return isinstance(dtype, np.dtype) or dtype == 'Sparse'

def astype(self, dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make a method on the base Dtype class as well which just returns .dtype

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as this will make it an official part of the interface.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any other types that this would be useful for? IMO it's not important enough to add to the interface.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

anything with a subtype? so Categorical and Interval?

"""Convert the SparseDtype to a new dtype.

This takes care of converting the ``fill_value``.

Parameters
----------
dtype : Union[str, numpy.dtype, SparseDtype]
The new dtype to use.

* For a SparseDtype, it is simply returned
* For a NumPy dtype (or str), the current fill value
is converted to the new dtype, and a SparseDtype
with `dtype` and the new fill value is returned.

Returns
-------
SparseDtype
A new SparseDtype with the correct `dtype` and fill value
for that `dtype`.

Raises
------
ValueError
When the current fill value cannot be converted to the
new `dtype` (e.g. trying to convert ``np.nan`` to an
integer dtype).


Examples
--------
>>> SparseDtype(int, 0).astype(float)
Sparse[float64, 0.0]

>>> SparseDtype(int, 1).astype(SparseDtype(float, np.nan))
Sparse[float64, nan]
"""
cls = type(self)
dtype = pandas_dtype(dtype)

if not isinstance(dtype, cls):
fill_value = astype_nansafe(np.array(self.fill_value),
dtype).item()
dtype = cls(dtype, fill_value=fill_value)

return dtype

@property
def _subtype_with_str(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this is only for Sparse which is ok

"""
Whether the SparseDtype's subtype should be considered ``str``.

Typically, pandas will store string data in an object-dtype array.
When converting values to a dtype, e.g. in ``.astype``, we need to
be more specific: we need the actual underlying type.

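Returns
-------
type or numpy.dtype
The type of the fill value when the fill value is a string,
otherwise ``self.subtype``.

Examples
--------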
>>> SparseDtype(int, 1)._subtype_with_str
dtype('int64')

>>> SparseDtype(object, 1)._subtype_with_str
dtype('O')

>>> dtype = SparseDtype(str, '')
>>> dtype.subtype
dtype('O')

>>> dtype._subtype_with_str
str
"""
if isinstance(self.fill_value, compat.string_types):
return type(self.fill_value)
return self.subtype


# ----------------------------------------------------------------------------
# Array

Expand Down Expand Up @@ -614,7 +691,7 @@ def __array__(self, dtype=None, copy=True):
# Can't put pd.NaT in a datetime64[ns]
fill_value = np.datetime64('NaT')
try:
dtype = np.result_type(self.sp_values.dtype, fill_value)
dtype = np.result_type(self.sp_values.dtype, type(fill_value))
Copy link
Contributor Author

@TomAugspurger TomAugspurger Nov 7, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was having trouble with string fill values.

except TypeError:
dtype = object

Expand Down Expand Up @@ -996,7 +1073,7 @@ def _take_with_fill(self, indices, fill_value=None):
if len(self) == 0:
# Empty... Allow taking only if all empty
if (indices == -1).all():
dtype = np.result_type(self.sp_values, fill_value)
dtype = np.result_type(self.sp_values, type(fill_value))
taken = np.empty_like(indices, dtype=dtype)
taken.fill(fill_value)
return taken
Expand All @@ -1009,7 +1086,7 @@ def _take_with_fill(self, indices, fill_value=None):
if self.sp_index.npoints == 0:
# Avoid taking from the empty self.sp_values
taken = np.full(sp_indexer.shape, fill_value=fill_value,
dtype=np.result_type(fill_value))
dtype=np.result_type(type(fill_value)))
else:
taken = self.sp_values.take(sp_indexer)

Expand All @@ -1030,12 +1107,13 @@ def _take_with_fill(self, indices, fill_value=None):
result_type = taken.dtype

if m0.any():
result_type = np.result_type(result_type, self.fill_value)
result_type = np.result_type(result_type,
type(self.fill_value))
taken = taken.astype(result_type)
taken[old_fill_indices] = self.fill_value

if m1.any():
result_type = np.result_type(result_type, fill_value)
result_type = np.result_type(result_type, type(fill_value))
taken = taken.astype(result_type)
taken[new_fill_indices] = fill_value

Expand All @@ -1061,7 +1139,7 @@ def _take_without_fill(self, indices):
# edge case in take...
# I think just return
out = np.full(indices.shape, self.fill_value,
dtype=np.result_type(self.fill_value))
dtype=np.result_type(type(self.fill_value)))
arr, sp_index, fill_value = make_sparse(out,
fill_value=self.fill_value)
return type(self)(arr, sparse_index=sp_index,
Expand All @@ -1073,7 +1151,7 @@ def _take_without_fill(self, indices):

if fillable.any():
# TODO: may need to coerce array to fill value
result_type = np.result_type(taken, self.fill_value)
result_type = np.result_type(taken, type(self.fill_value))
taken = taken.astype(result_type)
taken[fillable] = self.fill_value

Expand Down Expand Up @@ -1212,13 +1290,10 @@ def astype(self, dtype=None, copy=True):
IntIndex
Indices: array([2, 3], dtype=int32)
"""
dtype = pandas_dtype(dtype)

if not isinstance(dtype, SparseDtype):
dtype = SparseDtype(dtype, fill_value=self.fill_value)

dtype = self.dtype.astype(dtype)
subtype = dtype._subtype_with_str
sp_values = astype_nansafe(self.sp_values,
dtype.subtype,
subtype,
copy=copy)
if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,34 @@ def test_astype_all(self, any_real_dtype):
tm.assert_numpy_array_equal(np.asarray(res.values),
vals.astype(typ))

@pytest.mark.parametrize('array, dtype, expected', [
    # Integer values cast to float; expected fill value is 0.0.
    (SparseArray([0, 1]), 'float',
     SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))),
    # Integer values cast to bool.
    (SparseArray([0, 1]), bool, SparseArray([False, True])),
    # An explicit fill_value of 1 is expected to become True.
    (SparseArray([0, 1], fill_value=1), bool,
     SparseArray([False, True], dtype=SparseDtype(bool, True))),
    # Strict xfail: this case is expected to fail (reason: "NumPy-7619").
    pytest.param(
        SparseArray([0, 1]), 'datetime64[ns]',
        SparseArray(np.array([0, 1], dtype='datetime64[ns]'),
                    dtype=SparseDtype('datetime64[ns]',
                                      pd.Timestamp('1970'))),
        marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)],
    ),
    # Integers cast to str; expected fill value is the string '0'.
    (SparseArray([0, 1, 10]), str,
     SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))),
    # Numeric strings cast to float.
    (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
    # Cast to object; expected fill value stays 0.
    (SparseArray([0, 1, 0]), object,
     SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))),
])
def test_astype_more(self, array, dtype, expected):
    """Check ``SparseArray.astype`` against an expected SparseArray.

    Each parametrized case supplies an input array, a target dtype and
    the SparseArray the conversion is expected to produce.
    """
    converted = array.astype(dtype)
    tm.assert_sp_array_equal(converted, expected)

def test_astype_nan_raises(self):
    """Casting a SparseArray that contains NaN to ``int`` raises ValueError.

    The error message is expected to contain 'Cannot convert non-finite'.
    """
    arr = SparseArray([1.0, np.nan])
    # Use pytest's own context manager (with ``match``) rather than the
    # pandas-internal ``tm.assert_raises_regex`` helper; both search the
    # exception message with the given regular expression.
    with pytest.raises(ValueError, match='Cannot convert non-finite'):
        arr.astype(int)

def test_set_fill_value(self):
arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
arr.fill_value = 2
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/arrays/sparse/test_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,23 @@ def test_parse_subtype(string, expected):
def test_construct_from_string_fill_value_raises(string):
with pytest.raises(TypeError, match='fill_value in the string is not'):
SparseDtype.construct_from_string(string)


@pytest.mark.parametrize('original, dtype, expected', [
    # int -> float: fill value 0 is expected to become 0.0
    (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
    # int -> float: fill value 1 is expected to become 1.0
    (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
    # int -> str: expected result has object subtype and fill value '1'
    (SparseDtype(int, 1), str, SparseDtype(object, '1')),
    # float -> int: fill value 1.5 is expected to become 1
    (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
])
def test_astype(original, dtype, expected):
    """``SparseDtype.astype`` yields a dtype equal to the expected one."""
    assert original.astype(dtype) == expected


@pytest.mark.parametrize("original, dtype", [
    # NaN fill value with an integer target dtype
    (SparseDtype(float, np.nan), int),
    # Non-numeric string fill value with an integer target dtype
    (SparseDtype(str, 'abc'), int),
])
def test_astype_raises(original, dtype):
    """``SparseDtype.astype`` raises ValueError for these fill values."""
    with pytest.raises(ValueError):
        original.astype(dtype)