From 7b2da4c6408e2866753599c875a16d8836d07df4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Nov 2018 10:41:21 -0600 Subject: [PATCH 1/8] BUG: astype fill_value for SparseArray.astype I don't think we have a specific issue for this. This fixes strange things like ```python In [1]: import pandas as pd; import numpy as np In [2]: a = pd.SparseArray([0, 1]) In [3]: a.astype(bool) Out[3]: [0, True] Fill: 0 IntIndex Indices: array([1], dtype=int32) ``` --- pandas/core/arrays/sparse.py | 34 +++++++++++++++++------- pandas/tests/arrays/sparse/test_array.py | 26 ++++++++++++++++++ 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index a63b3fb53625f..8f35222ad3b56 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -614,7 +614,7 @@ def __array__(self, dtype=None, copy=True): # Can't put pd.NaT in a datetime64[ns] fill_value = np.datetime64('NaT') try: - dtype = np.result_type(self.sp_values.dtype, fill_value) + dtype = np.result_type(self.sp_values.dtype, type(fill_value)) except TypeError: dtype = object @@ -996,7 +996,7 @@ def _take_with_fill(self, indices, fill_value=None): if len(self) == 0: # Empty... 
Allow taking only if all empty if (indices == -1).all(): - dtype = np.result_type(self.sp_values, fill_value) + dtype = np.result_type(self.sp_values, type(fill_value)) taken = np.empty_like(indices, dtype=dtype) taken.fill(fill_value) return taken @@ -1009,7 +1009,7 @@ def _take_with_fill(self, indices, fill_value=None): if self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values taken = np.full(sp_indexer.shape, fill_value=fill_value, - dtype=np.result_type(fill_value)) + dtype=np.result_type(type(fill_value))) else: taken = self.sp_values.take(sp_indexer) @@ -1030,12 +1030,12 @@ def _take_with_fill(self, indices, fill_value=None): result_type = taken.dtype if m0.any(): - result_type = np.result_type(result_type, self.fill_value) + result_type = np.result_type(result_type, type(self.fill_value)) taken = taken.astype(result_type) taken[old_fill_indices] = self.fill_value if m1.any(): - result_type = np.result_type(result_type, fill_value) + result_type = np.result_type(result_type, type(fill_value)) taken = taken.astype(result_type) taken[new_fill_indices] = fill_value @@ -1061,7 +1061,7 @@ def _take_without_fill(self, indices): # edge case in take... 
# I think just return out = np.full(indices.shape, self.fill_value, - dtype=np.result_type(self.fill_value)) + dtype=np.result_type(type(self.fill_value))) arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value) return type(self)(arr, sparse_index=sp_index, @@ -1073,7 +1073,7 @@ def _take_without_fill(self, indices): if fillable.any(): # TODO: may need to coerce array to fill value - result_type = np.result_type(taken, self.fill_value) + result_type = np.result_type(taken, type(self.fill_value)) taken = taken.astype(result_type) taken[fillable] = self.fill_value @@ -1215,10 +1215,26 @@ def astype(self, dtype=None, copy=True): dtype = pandas_dtype(dtype) if not isinstance(dtype, SparseDtype): - dtype = SparseDtype(dtype, fill_value=self.fill_value) + fill_value = astype_nansafe(np.array(self.fill_value), + dtype).item() + dtype = SparseDtype(dtype, fill_value=fill_value) + + # Typically we'll just astype the sp_values to dtype.subtype, + # but SparseDtype follows the pandas convention of storing strings + # as object dtype. So SparseDtype(str) immediately becomes + # SparseDtype(object), and at this point we don't know whether object + # means string or something else. We *cannot* just pass object to + # astype_nansafe below, since that won't convert to string. So + # we rely on the assumption that "string fill_value" means strings + # which is close enough to being true. 
+ if (is_object_dtype(dtype.subtype) and + isinstance(dtype.fill_value, compat.text_type)): + subtype = str + else: + subtype = dtype.subtype sp_values = astype_nansafe(self.sp_values, - dtype.subtype, + subtype, copy=copy) if sp_values is self.sp_values and copy: sp_values = sp_values.copy() diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 852c4fb910560..53f2863b6d790 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -468,6 +468,32 @@ def test_astype_all(self, any_real_dtype): tm.assert_numpy_array_equal(np.asarray(res.values), vals.astype(typ)) + @pytest.mark.parametrize('array, dtype, expected', [ + (SparseArray([0, 1]), 'float', + SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))), + (SparseArray([0, 1]), bool, SparseArray([False, True])), + (SparseArray([0, 1], fill_value=1), bool, + SparseArray([False, True], dtype=SparseDtype(bool, True))), + pytest.param( + SparseArray([0, 1]), 'datetime64[ns]', + SparseArray(np.array([0, 1], dtype='datetime64[ns]'), + dtype=SparseDtype('datetime64[ns]', + pd.Timestamp('1970'))), + marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)], + ), + (SparseArray([0, 1, 10]), str, + SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))), + (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])), + ]) + def test_astype_more(self, array, dtype, expected): + result = array.astype(dtype) + tm.assert_sp_array_equal(result, expected) + + def test_astype_nan_raises(self): + arr = SparseArray([1.0, np.nan]) + with tm.assert_raises_regex(ValueError, 'Cannot convert non-finite'): + arr.astype(int) + def test_set_fill_value(self): arr = SparseArray([1., np.nan, 2.], fill_value=np.nan) arr.fill_value = 2 From 232921b86d6348a1618857e28609b9675b2cffa2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Nov 2018 11:12:01 -0600 Subject: [PATCH 2/8] object type, lint --- pandas/core/arrays/sparse.py | 3 ++- 
pandas/tests/arrays/sparse/test_array.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 8f35222ad3b56..d4936409bdb7f 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1030,7 +1030,8 @@ def _take_with_fill(self, indices, fill_value=None): result_type = taken.dtype if m0.any(): - result_type = np.result_type(result_type, type(self.fill_value)) + result_type = np.result_type(result_type, + type(self.fill_value)) taken = taken.astype(result_type) taken[old_fill_indices] = self.fill_value diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 53f2863b6d790..f8dcf2186e62b 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -477,13 +477,15 @@ def test_astype_all(self, any_real_dtype): pytest.param( SparseArray([0, 1]), 'datetime64[ns]', SparseArray(np.array([0, 1], dtype='datetime64[ns]'), - dtype=SparseDtype('datetime64[ns]', - pd.Timestamp('1970'))), + dtype=SparseDtype('datetime64[ns]', + pd.Timestamp('1970'))), marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)], ), (SparseArray([0, 1, 10]), str, SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))), (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])), + (SparseArray([0, 1, 0]), object, + SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))), ]) def test_astype_more(self, array, dtype, expected): result = array.astype(dtype) From 7454e31a904b886b209c2d835437c33651e36026 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Nov 2018 16:04:35 -0600 Subject: [PATCH 3/8] text --- pandas/core/arrays/sparse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index d4936409bdb7f..d69f51bf8837f 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1229,8 +1229,8 @@ def 
astype(self, dtype=None, copy=True): # we rely on the assumption that "string fill_value" means strings # which is close enough to being true. if (is_object_dtype(dtype.subtype) and - isinstance(dtype.fill_value, compat.text_type)): - subtype = str + isinstance(dtype.fill_value, compat.string_types)): + subtype = compat.text_type else: subtype = dtype.subtype From 1cc43d63c83787870a3f91dbb353f478f8e69849 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Nov 2018 11:45:21 -0600 Subject: [PATCH 4/8] Moved to astype --- pandas/core/arrays/sparse.py | 100 ++++++++++++++++++----- pandas/tests/arrays/sparse/test_dtype.py | 20 +++++ 2 files changed, 99 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index d69f51bf8837f..3500144f979ad 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -284,6 +284,83 @@ def is_dtype(cls, dtype): return True return isinstance(dtype, np.dtype) or dtype == 'Sparse' + def astype(self, dtype): + """Convert the SparseDtype to a new dtype. + + This takes care of converting the ``fill_value``. + + Parameters + ---------- + dtype : Union[str, numpy.dtype, SparseDtype] + The new dtype to use. + + * For a SparseDtype, it is simply returned + * For a NumPy dtype (or str), the current fill value + is converted to the new dtype, and a SparseDtype + with `dtype` and the new fill value is returned. + + Returns + ------- + SparseDtype + A new SparseDtype with the correct `dtype` and fill value + for that `dtype`. + + Raises + ------ + ValueError + When the current fill value cannot be converted to the + new `dtype` (e.g. trying to convert ``np.nan`` to an + integer dtype. 
+ + + Examples + -------- + >>> SparseDtype(int, 0).astype(float) + Sparse[float64, 0.0] + + >>> SparseDtype(int, 1).astype(SparseDtype(float, np.nan)) + Sparse[float64, nan] + """ + cls = type(self) + dtype = pandas_dtype(dtype) + + if not isinstance(dtype, cls): + fill_value = astype_nansafe(np.array(self.fill_value), + dtype).item() + dtype = cls(dtype, fill_value=fill_value) + + return dtype + + @property + def _subtype_with_str(self): + """ + Whether the SparseDtype's subtype should be considered ``str``. + + Typically, pandas will store string data in an object-dtype array. + When converting values to a dtype, e.g. in ``.astype``, we need to + be more specific, we need the actual underlying type. + + Returns + ------- + + >>> SparseDtype(int, 1)._subtype_with_str + dtype('int64') + + >>> SparseDtype(object, 1)._subtype_with_str + dtype('O') + + >>> dtype = SparseDtype(str, '') + >>> dtype.subtype + dtype('O') + + >>> dtype._subtype_with_str + str + """ + if isinstance(self.fill_value, compat.string_types): + return type(self.fill_value) + return self.subtype + + # ---------------------------------------------------------------------------- # Array @@ -1213,27 +1290,8 @@ def astype(self, dtype=None, copy=True): IntIndex Indices: array([2, 3], dtype=int32) """ - dtype = pandas_dtype(dtype) - - if not isinstance(dtype, SparseDtype): - fill_value = astype_nansafe(np.array(self.fill_value), - dtype).item() - dtype = SparseDtype(dtype, fill_value=fill_value) - - # Typically we'll just astype the sp_values to dtype.subtype, - # but SparseDtype follows the pandas convention of storing strings - # as object dtype. So SparseDtype(str) immediately becomes - # SparseDtype(object), and at this point we don't know whether object - # means string or something else. We *cannot* just pass object to - # astype_nansafe below, since that won't convert to string. So - # we rely on the assumption that "string fill_value" means strings - # which is close enough to being true. 
- if (is_object_dtype(dtype.subtype) and - isinstance(dtype.fill_value, compat.string_types)): - subtype = compat.text_type - else: - subtype = dtype.subtype - + dtype = self.dtype.astype(dtype) + subtype = dtype._subtype_with_str sp_values = astype_nansafe(self.sp_values, subtype, copy=copy) diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 7c310693cf26c..d834129652f8c 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -139,3 +139,23 @@ def test_parse_subtype(string, expected): def test_construct_from_string_fill_value_raises(string): with pytest.raises(TypeError, match='fill_value in the string is not'): SparseDtype.construct_from_string(string) + + +@pytest.mark.parametrize('original, dtype, expected', [ + (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), + (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), + (SparseDtype(int, 1), str, SparseDtype(object, '1')), + (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), +]) +def test_astype(original, dtype, expected): + result = original.astype(dtype) + assert result == expected + + +@pytest.mark.parametrize("original, dtype", [ + (SparseDtype(float, np.nan), int), + (SparseDtype(str, 'abc'), int), +]) +def test_astype_raises(original, dtype): + with pytest.raises(ValueError): + original.astype(dtype) From 57d32ae3da2d19317615a3c0a390a9546ad95149 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Nov 2018 11:49:19 -0600 Subject: [PATCH 5/8] closing paren --- pandas/core/arrays/sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 3500144f979ad..36396992b3663 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -310,7 +310,7 @@ def astype(self, dtype): ValueError When the current fill value cannot be converted to the new `dtype` (e.g. 
trying to convert ``np.nan`` to an - integer dtype. + integer dtype). Examples From d93d98f0a5a264452f36701b4e7578d225e6b60e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Nov 2018 14:41:04 -0600 Subject: [PATCH 6/8] astype -> update_dtype --- pandas/core/arrays/sparse.py | 8 ++++---- pandas/tests/arrays/sparse/test_dtype.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 36396992b3663..4148cb1f448a3 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -284,7 +284,7 @@ def is_dtype(cls, dtype): return True return isinstance(dtype, np.dtype) or dtype == 'Sparse' - def astype(self, dtype): + def update_dtype(self, dtype): """Convert the SparseDtype to a new dtype. This takes care of converting the ``fill_value``. @@ -315,10 +315,10 @@ def astype(self, dtype): Examples -------- - >>> SparseDtype(int, 0).astype(float) + >>> SparseDtype(int, 0).update_dtype(float) Sparse[float64, 0.0] - >>> SparseDtype(int, 1).astype(SparseDtype(float, np.nan)) + >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) Sparse[float64, nan] """ cls = type(self) @@ -1290,7 +1290,7 @@ def astype(self, dtype=None, copy=True): IntIndex Indices: array([2, 3], dtype=int32) """ - dtype = self.dtype.astype(dtype) + dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str sp_values = astype_nansafe(self.sp_values, subtype, diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index d834129652f8c..2d386de0d31a3 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -147,8 +147,8 @@ def test_construct_from_string_fill_value_raises(string): (SparseDtype(int, 1), str, SparseDtype(object, '1')), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ]) -def test_astype(original, dtype, expected): - result = original.astype(dtype) +def 
test_update_dtype(original, dtype, expected): + result = original.update_dtype(dtype) assert result == expected @@ -156,6 +156,6 @@ def test_astype(original, dtype, expected): (SparseDtype(float, np.nan), int), (SparseDtype(str, 'abc'), int), ]) -def test_astype_raises(original, dtype): +def test_update_dtype_raises(original, dtype): with pytest.raises(ValueError): - original.astype(dtype) + original.update_dtype(dtype) From 4f4b3a3fe9646bf64f77eb05757a2da35b896a3a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Nov 2018 15:02:36 -0600 Subject: [PATCH 7/8] pytest.raises --- pandas/tests/arrays/sparse/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index d7e4807c8e816..0e5a8280cc467 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -502,7 +502,7 @@ def test_astype_more(self, array, dtype, expected): def test_astype_nan_raises(self): arr = SparseArray([1.0, np.nan]) - with tm.assert_raises_regex(ValueError, 'Cannot convert non-finite'): + with pytest.raises(ValueError, match='Cannot convert non-finite'): arr.astype(int) def test_set_fill_value(self): From 3dfc07e0161d9d0a4745ede5ab60e1485522bb22 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Nov 2018 05:34:31 -0600 Subject: [PATCH 8/8] handle nan --- pandas/core/arrays/sparse.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 4148cb1f448a3..672261c2a407e 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1171,7 +1171,9 @@ def _concat_same_type(cls, to_concat): fill_value = fill_values[0] - if len(set(fill_values)) > 1: + # np.nan isn't a singleton, so we may end up with multiple + # NaNs here, so we ignore the all NA case too. 
+ if not (len(set(fill_values)) == 1 or isna(fill_values).all()): warnings.warn("Concatenating sparse arrays with multiple fill " "values: '{}'. Picking the first and " "converting the rest.".format(fill_values),