From 7b2da4c6408e2866753599c875a16d8836d07df4 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 7 Nov 2018 10:41:21 -0600
Subject: [PATCH 1/8] BUG: astype fill_value for SparseArray.astype

I don't think we have a specific issue for this.

This fixes strange things like

```python
In [1]: import pandas as pd; import numpy as np

In [2]: a = pd.SparseArray([0, 1])

In [3]: a.astype(bool)
Out[3]:
[0, True]
Fill: 0
IntIndex
Indices: array([1], dtype=int32)
```
---
 pandas/core/arrays/sparse.py             | 34 +++++++++++++++++-------
 pandas/tests/arrays/sparse/test_array.py | 26 ++++++++++++++++++
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index a63b3fb53625f..8f35222ad3b56 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -614,7 +614,7 @@ def __array__(self, dtype=None, copy=True):
                     # Can't put pd.NaT in a datetime64[ns]
                     fill_value = np.datetime64('NaT')
             try:
-                dtype = np.result_type(self.sp_values.dtype, fill_value)
+                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
             except TypeError:
                 dtype = object
 
@@ -996,7 +996,7 @@ def _take_with_fill(self, indices, fill_value=None):
         if len(self) == 0:
             # Empty... Allow taking only if all empty
             if (indices == -1).all():
-                dtype = np.result_type(self.sp_values, fill_value)
+                dtype = np.result_type(self.sp_values, type(fill_value))
                 taken = np.empty_like(indices, dtype=dtype)
                 taken.fill(fill_value)
                 return taken
@@ -1009,7 +1009,7 @@ def _take_with_fill(self, indices, fill_value=None):
         if self.sp_index.npoints == 0:
             # Avoid taking from the empty self.sp_values
             taken = np.full(sp_indexer.shape, fill_value=fill_value,
-                            dtype=np.result_type(fill_value))
+                            dtype=np.result_type(type(fill_value)))
         else:
             taken = self.sp_values.take(sp_indexer)
 
@@ -1030,12 +1030,12 @@ def _take_with_fill(self, indices, fill_value=None):
             result_type = taken.dtype
 
             if m0.any():
-                result_type = np.result_type(result_type, self.fill_value)
+                result_type = np.result_type(result_type, type(self.fill_value))
                 taken = taken.astype(result_type)
                 taken[old_fill_indices] = self.fill_value
 
             if m1.any():
-                result_type = np.result_type(result_type, fill_value)
+                result_type = np.result_type(result_type, type(fill_value))
                 taken = taken.astype(result_type)
                 taken[new_fill_indices] = fill_value
 
@@ -1061,7 +1061,7 @@ def _take_without_fill(self, indices):
             # edge case in take...
             # I think just return
             out = np.full(indices.shape, self.fill_value,
-                          dtype=np.result_type(self.fill_value))
+                          dtype=np.result_type(type(self.fill_value)))
             arr, sp_index, fill_value = make_sparse(out,
                                                     fill_value=self.fill_value)
             return type(self)(arr, sparse_index=sp_index,
@@ -1073,7 +1073,7 @@ def _take_without_fill(self, indices):
 
         if fillable.any():
             # TODO: may need to coerce array to fill value
-            result_type = np.result_type(taken, self.fill_value)
+            result_type = np.result_type(taken, type(self.fill_value))
             taken = taken.astype(result_type)
             taken[fillable] = self.fill_value
 
@@ -1215,10 +1215,26 @@ def astype(self, dtype=None, copy=True):
         dtype = pandas_dtype(dtype)
 
         if not isinstance(dtype, SparseDtype):
-            dtype = SparseDtype(dtype, fill_value=self.fill_value)
+            fill_value = astype_nansafe(np.array(self.fill_value),
+                                        dtype).item()
+            dtype = SparseDtype(dtype, fill_value=fill_value)
+
+        # Typically we'll just astype the sp_values to dtype.subtype,
+        # but SparseDtype follows the pandas convention of storing strings
+        # as object dtype. So SparseDtype(str) immediately becomes
+        # SparseDtype(object), and at this point we don't know whether object
+        # means string or something else. We *cannot* just pass object to
+        # astype_nansafe below, since that won't convert to string. So
+        # we rely on the assumption that "string fill_value" means strings
+        # which is close enough to being true.
+        if (is_object_dtype(dtype.subtype) and
+                isinstance(dtype.fill_value, compat.text_type)):
+            subtype = str
+        else:
+            subtype = dtype.subtype
 
         sp_values = astype_nansafe(self.sp_values,
-                                   dtype.subtype,
+                                   subtype,
                                    copy=copy)
         if sp_values is self.sp_values and copy:
             sp_values = sp_values.copy()
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index 852c4fb910560..53f2863b6d790 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -468,6 +468,32 @@ def test_astype_all(self, any_real_dtype):
         tm.assert_numpy_array_equal(np.asarray(res.values),
                                     vals.astype(typ))
 
+    @pytest.mark.parametrize('array, dtype, expected', [
+        (SparseArray([0, 1]), 'float',
+         SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))),
+        (SparseArray([0, 1]), bool, SparseArray([False, True])),
+        (SparseArray([0, 1], fill_value=1), bool,
+         SparseArray([False, True], dtype=SparseDtype(bool, True))),
+        pytest.param(
+            SparseArray([0, 1]), 'datetime64[ns]',
+            SparseArray(np.array([0, 1], dtype='datetime64[ns]'),
+                         dtype=SparseDtype('datetime64[ns]',
+                                           pd.Timestamp('1970'))),
+            marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)],
+        ),
+        (SparseArray([0, 1, 10]), str,
+         SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))),
+        (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
+    ])
+    def test_astype_more(self, array, dtype, expected):
+        result = array.astype(dtype)
+        tm.assert_sp_array_equal(result, expected)
+
+    def test_astype_nan_raises(self):
+        arr = SparseArray([1.0, np.nan])
+        with tm.assert_raises_regex(ValueError, 'Cannot convert non-finite'):
+            arr.astype(int)
+
     def test_set_fill_value(self):
         arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
         arr.fill_value = 2

From 232921b86d6348a1618857e28609b9675b2cffa2 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 7 Nov 2018 11:12:01 -0600
Subject: [PATCH 2/8] object type, lint

---
 pandas/core/arrays/sparse.py             | 3 ++-
 pandas/tests/arrays/sparse/test_array.py | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index 8f35222ad3b56..d4936409bdb7f 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -1030,7 +1030,8 @@ def _take_with_fill(self, indices, fill_value=None):
             result_type = taken.dtype
 
             if m0.any():
-                result_type = np.result_type(result_type, type(self.fill_value))
+                result_type = np.result_type(result_type,
+                                             type(self.fill_value))
                 taken = taken.astype(result_type)
                 taken[old_fill_indices] = self.fill_value
 
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index 53f2863b6d790..f8dcf2186e62b 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -477,13 +477,15 @@ def test_astype_all(self, any_real_dtype):
         pytest.param(
             SparseArray([0, 1]), 'datetime64[ns]',
             SparseArray(np.array([0, 1], dtype='datetime64[ns]'),
-                         dtype=SparseDtype('datetime64[ns]',
-                                           pd.Timestamp('1970'))),
+                        dtype=SparseDtype('datetime64[ns]',
+                                          pd.Timestamp('1970'))),
             marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)],
         ),
         (SparseArray([0, 1, 10]), str,
          SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))),
         (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
+        (SparseArray([0, 1, 0]), object,
+         SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))),
     ])
     def test_astype_more(self, array, dtype, expected):
         result = array.astype(dtype)

From 7454e31a904b886b209c2d835437c33651e36026 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 7 Nov 2018 16:04:35 -0600
Subject: [PATCH 3/8] text

---
 pandas/core/arrays/sparse.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index d4936409bdb7f..d69f51bf8837f 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -1229,8 +1229,8 @@ def astype(self, dtype=None, copy=True):
         # we rely on the assumption that "string fill_value" means strings
         # which is close enough to being true.
         if (is_object_dtype(dtype.subtype) and
-                isinstance(dtype.fill_value, compat.text_type)):
-            subtype = str
+                isinstance(dtype.fill_value, compat.string_types)):
+            subtype = compat.text_type
         else:
             subtype = dtype.subtype
 

From 1cc43d63c83787870a3f91dbb353f478f8e69849 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sun, 11 Nov 2018 11:45:21 -0600
Subject: [PATCH 4/8] Moved to astype

---
 pandas/core/arrays/sparse.py             | 100 ++++++++++++++++++-----
 pandas/tests/arrays/sparse/test_dtype.py |  20 +++++
 2 files changed, 99 insertions(+), 21 deletions(-)

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index d69f51bf8837f..3500144f979ad 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -284,6 +284,83 @@ def is_dtype(cls, dtype):
             return True
         return isinstance(dtype, np.dtype) or dtype == 'Sparse'
 
+    def astype(self, dtype):
+        """Convert the SparseDtype to a new dtype.
+
+        This takes care of converting the ``fill_value``.
+
+        Parameters
+        ----------
+        dtype : Union[str, numpy.dtype, SparseDtype]
+            The new dtype to use.
+
+            * For a SparseDtype, it is simply returned
+            * For a NumPy dtype (or str), the current fill value
+              is converted to the new dtype, and a SparseDtype
+              with `dtype` and the new fill value is returned.
+
+        Returns
+        -------
+        SparseDtype
+            A new SparseDtype with the correct `dtype` and fill value
+            for that `dtype`.
+
+        Raises
+        ------
+        ValueError
+            When the current fill value cannot be converted to the
+            new `dtype` (e.g. trying to convert ``np.nan`` to an
+            integer dtype.
+
+
+        Examples
+        --------
+        >>> SparseDtype(int, 0).astype(float)
+        Sparse[float64, 0.0]
+
+        >>> SparseDtype(int, 1).astype(SparseDtype(float, np.nan))
+        Sparse[float64, nan]
+        """
+        cls = type(self)
+        dtype = pandas_dtype(dtype)
+
+        if not isinstance(dtype, cls):
+            fill_value = astype_nansafe(np.array(self.fill_value),
+                                        dtype).item()
+            dtype = cls(dtype, fill_value=fill_value)
+
+        return dtype
+
+    @property
+    def _subtype_with_str(self):
+        """
+        The subtype, or the string type when ``fill_value`` is a string.
+
+        Typically, pandas will store string data in an object-dtype array.
+        When converting values to a dtype, e.g. in ``.astype``, we need to
+        be more specific, we need the actual underlying type.
+
+        Returns
+        -------
+
+        >>> SparseDtype(int, 1)._subtype_with_str
+        dtype('int64')
+
+        >>> SparseDtype(object, 1)._subtype_with_str
+        dtype('O')
+
+        >>> dtype = SparseDtype(str, '')
+        >>> dtype.subtype
+        dtype('O')
+
+        >>> dtype._subtype_with_str
+        str
+        """
+        if isinstance(self.fill_value, compat.string_types):
+            return type(self.fill_value)
+        return self.subtype
+
+
 # ----------------------------------------------------------------------------
 # Array
 
@@ -1213,27 +1290,8 @@ def astype(self, dtype=None, copy=True):
         IntIndex
         Indices: array([2, 3], dtype=int32)
         """
-        dtype = pandas_dtype(dtype)
-
-        if not isinstance(dtype, SparseDtype):
-            fill_value = astype_nansafe(np.array(self.fill_value),
-                                        dtype).item()
-            dtype = SparseDtype(dtype, fill_value=fill_value)
-
-        # Typically we'll just astype the sp_values to dtype.subtype,
-        # but SparseDtype follows the pandas convention of storing strings
-        # as object dtype. So SparseDtype(str) immediately becomes
-        # SparseDtype(object), and at this point we don't know whether object
-        # means string or something else. We *cannot* just pass object to
-        # astype_nansafe below, since that won't convert to string. So
-        # we rely on the assumption that "string fill_value" means strings
-        # which is close enough to being true.
-        if (is_object_dtype(dtype.subtype) and
-                isinstance(dtype.fill_value, compat.string_types)):
-            subtype = compat.text_type
-        else:
-            subtype = dtype.subtype
-
+        dtype = self.dtype.astype(dtype)
+        subtype = dtype._subtype_with_str
         sp_values = astype_nansafe(self.sp_values,
                                    subtype,
                                    copy=copy)
diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
index 7c310693cf26c..d834129652f8c 100644
--- a/pandas/tests/arrays/sparse/test_dtype.py
+++ b/pandas/tests/arrays/sparse/test_dtype.py
@@ -139,3 +139,23 @@ def test_parse_subtype(string, expected):
 def test_construct_from_string_fill_value_raises(string):
     with pytest.raises(TypeError, match='fill_value in the string is not'):
         SparseDtype.construct_from_string(string)
+
+
+@pytest.mark.parametrize('original, dtype, expected', [
+    (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
+    (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
+    (SparseDtype(int, 1), str, SparseDtype(object, '1')),
+    (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
+])
+def test_astype(original, dtype, expected):
+    result = original.astype(dtype)
+    assert result == expected
+
+
+@pytest.mark.parametrize("original, dtype", [
+    (SparseDtype(float, np.nan), int),
+    (SparseDtype(str, 'abc'), int),
+])
+def test_astype_raises(original, dtype):
+    with pytest.raises(ValueError):
+        original.astype(dtype)

From 57d32ae3da2d19317615a3c0a390a9546ad95149 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sun, 11 Nov 2018 11:49:19 -0600
Subject: [PATCH 5/8] closing paren

---
 pandas/core/arrays/sparse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index 3500144f979ad..36396992b3663 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -310,7 +310,7 @@ def astype(self, dtype):
         ValueError
             When the current fill value cannot be converted to the
             new `dtype` (e.g. trying to convert ``np.nan`` to an
-            integer dtype.
+            integer dtype).
 
 
         Examples

From d93d98f0a5a264452f36701b4e7578d225e6b60e Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sun, 11 Nov 2018 14:41:04 -0600
Subject: [PATCH 6/8] astype -> update_dtype

---
 pandas/core/arrays/sparse.py             | 8 ++++----
 pandas/tests/arrays/sparse/test_dtype.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index 36396992b3663..4148cb1f448a3 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -284,7 +284,7 @@ def is_dtype(cls, dtype):
             return True
         return isinstance(dtype, np.dtype) or dtype == 'Sparse'
 
-    def astype(self, dtype):
+    def update_dtype(self, dtype):
         """Convert the SparseDtype to a new dtype.
 
         This takes care of converting the ``fill_value``.
@@ -315,10 +315,10 @@ def astype(self, dtype):
 
         Examples
         --------
-        >>> SparseDtype(int, 0).astype(float)
+        >>> SparseDtype(int, 0).update_dtype(float)
         Sparse[float64, 0.0]
 
-        >>> SparseDtype(int, 1).astype(SparseDtype(float, np.nan))
+        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
         Sparse[float64, nan]
         """
         cls = type(self)
@@ -1290,7 +1290,7 @@ def astype(self, dtype=None, copy=True):
         IntIndex
         Indices: array([2, 3], dtype=int32)
         """
-        dtype = self.dtype.astype(dtype)
+        dtype = self.dtype.update_dtype(dtype)
         subtype = dtype._subtype_with_str
         sp_values = astype_nansafe(self.sp_values,
                                    subtype,
diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
index d834129652f8c..2d386de0d31a3 100644
--- a/pandas/tests/arrays/sparse/test_dtype.py
+++ b/pandas/tests/arrays/sparse/test_dtype.py
@@ -147,8 +147,8 @@ def test_construct_from_string_fill_value_raises(string):
     (SparseDtype(int, 1), str, SparseDtype(object, '1')),
     (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
 ])
-def test_astype(original, dtype, expected):
-    result = original.astype(dtype)
+def test_update_dtype(original, dtype, expected):
+    result = original.update_dtype(dtype)
     assert result == expected
 
 
@@ -156,6 +156,6 @@ def test_astype(original, dtype, expected):
     (SparseDtype(float, np.nan), int),
     (SparseDtype(str, 'abc'), int),
 ])
-def test_astype_raises(original, dtype):
+def test_update_dtype_raises(original, dtype):
     with pytest.raises(ValueError):
-        original.astype(dtype)
+        original.update_dtype(dtype)

From 4f4b3a3fe9646bf64f77eb05757a2da35b896a3a Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sun, 11 Nov 2018 15:02:36 -0600
Subject: [PATCH 7/8] pytest.raises

---
 pandas/tests/arrays/sparse/test_array.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index d7e4807c8e816..0e5a8280cc467 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -502,7 +502,7 @@ def test_astype_more(self, array, dtype, expected):
 
     def test_astype_nan_raises(self):
         arr = SparseArray([1.0, np.nan])
-        with tm.assert_raises_regex(ValueError, 'Cannot convert non-finite'):
+        with pytest.raises(ValueError, match='Cannot convert non-finite'):
             arr.astype(int)
 
     def test_set_fill_value(self):

From 3dfc07e0161d9d0a4745ede5ab60e1485522bb22 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 12 Nov 2018 05:34:31 -0600
Subject: [PATCH 8/8] handle nan

---
 pandas/core/arrays/sparse.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index 4148cb1f448a3..672261c2a407e 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -1171,7 +1171,9 @@ def _concat_same_type(cls, to_concat):
 
         fill_value = fill_values[0]
 
-        if len(set(fill_values)) > 1:
+        # np.nan isn't a singleton, so we may end up with multiple
+        # NaNs here, so we ignore the all-NA case too.
+        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
             warnings.warn("Concatenating sparse arrays with multiple fill "
                           "values: '{}'. Picking the first and "
                           "converting the rest.".format(fill_values),