From f89a38061d2d11d12997ef5abbed1023b29cc489 Mon Sep 17 00:00:00 2001
From: Licht-T <licht-t@outlook.jp>
Date: Thu, 31 Aug 2017 08:34:24 +0900
Subject: [PATCH 1/3] BUG: Fix wrong SparseBlock initialization in where method

BUG: Fix wrong SparseBlock initialization in quantile method

BUG: Fix make_sparse mask generation not to cast when dtype is object

BUG: Add SparseArray.all method

BUG: Add copy parameter to prevent reinterpret cast of sparse

Revert and fix astype parameters

BUG: Create SparseBlock.__init__ to set type information of SparseArray

BUG: Override SparseBlock._can_hold_element

Revert changes in Block.where

BUG: Override SparseBlock.make_block with fill_value argument

BUG: Set fill_value and ndim parameter in make_block when generating SparseBlock from result

BUG: Override SparseBlock._try_coerce_result to make the result flattened and sparse

BUG: Change from _can_hold_na to _can_hold_element to support non-NA fill values

BUG: Fix 1D check statement
SparseDataFrame.where passes a (1, n)-shape SparseBlock, but the actual values are an n-length SparseArray

BUG: Adjust cond shape to SparseBlock
SparseDataFrame.where passes a (1, n)-shape SparseBlock and condition block to Block.where,
but Block.where then compares the n-length SparseArray held by the SparseBlock with the (1, n)-shape condition block.

BUG: Override SparseDataFrame.where method to set _default_fill_value
---
 pandas/core/internals.py    | 119 ++++++++++++++++++++++++++++++++----
 pandas/core/sparse/array.py |   2 +-
 pandas/core/sparse/frame.py |  14 ++++-
 3 files changed, 119 insertions(+), 16 deletions(-)

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 045580d393b26..7bae661ba93dd 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -29,6 +29,7 @@
     is_bool_dtype,
     is_object_dtype,
     is_datetimelike_v_numeric,
+    is_complex_dtype,
     is_float_dtype, is_numeric_dtype,
     is_numeric_v_string_like, is_extension_type,
     is_list_like,
@@ -454,8 +455,11 @@ def make_a_block(nv, ref_loc):
                     nv = _block_shape(nv, ndim=self.ndim)
                 except (AttributeError, NotImplementedError):
                     pass
+
                 block = self.make_block(values=nv,
-                                        placement=ref_loc, fastpath=True)
+                                        placement=ref_loc,
+                                        fastpath=True)
+
             return block
 
         # ndim == 1
@@ -1020,7 +1024,7 @@ def f(m, v, i):
 
         return [self.make_block(new_values, fastpath=True)]
 
-    def coerce_to_target_dtype(self, other):
+    def coerce_to_target_dtype(self, other, copy=False):
         """
         coerce the current block to a dtype compat for other
         we will return a block, possibly object, and not raise
@@ -1037,7 +1041,7 @@ def coerce_to_target_dtype(self, other):
 
         if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype):
             # we don't upcast to bool
-            return self.astype(object)
+            return self.astype(object, copy=copy)
 
         elif ((self.is_float or self.is_complex) and
               (is_integer_dtype(dtype) or is_float_dtype(dtype))):
@@ -1051,14 +1055,14 @@ def coerce_to_target_dtype(self, other):
             # not a datetime
             if not ((is_datetime64_dtype(dtype) or
                      is_datetime64tz_dtype(dtype)) and self.is_datetime):
-                return self.astype(object)
+                return self.astype(object, copy=copy)
 
             # don't upcast timezone with different timezone or no timezone
             mytz = getattr(self.dtype, 'tz', None)
             othertz = getattr(dtype, 'tz', None)
 
             if str(mytz) != str(othertz):
-                return self.astype(object)
+                return self.astype(object, copy=copy)
 
             raise AssertionError("possible recursion in "
                                  "coerce_to_target_dtype: {} {}".format(
@@ -1068,18 +1072,18 @@ def coerce_to_target_dtype(self, other):
 
             # not a timedelta
             if not (is_timedelta64_dtype(dtype) and self.is_timedelta):
-                return self.astype(object)
+                return self.astype(object, copy=copy)
 
             raise AssertionError("possible recursion in "
                                  "coerce_to_target_dtype: {} {}".format(
                                      self, other))
 
         try:
-            return self.astype(dtype)
+            return self.astype(dtype, copy=copy)
         except (ValueError, TypeError):
             pass
 
-        return self.astype(object)
+        return self.astype(object, copy=copy)
 
     def interpolate(self, method='pad', axis=0, index=None, values=None,
                     inplace=False, limit=None, limit_direction='forward',
@@ -1440,6 +1444,11 @@ def where(self, other, cond, align=True, errors='raise',
         if hasattr(other, 'reindex_axis'):
             other = other.values
 
+        if is_scalar(other) or is_list_like(other):
+            fill_value = other
+        else:
+            fill_value = None
+
         if hasattr(cond, 'reindex_axis'):
             cond = cond.values
 
@@ -1452,6 +1461,9 @@ def where(self, other, cond, align=True, errors='raise',
         if not hasattr(cond, 'shape'):
             raise ValueError("where must have a condition that is ndarray "
                              "like")
+        else:
+            if self.is_sparse:
+                cond = cond.flatten()
 
         # our where function
         def func(cond, values, other):
@@ -1489,7 +1501,7 @@ def func(cond, values, other):
                                  transpose=transpose)
             return self._maybe_downcast(blocks, 'infer')
 
-        if self._can_hold_na or self.ndim == 1:
+        if self._can_hold_element(fill_value) or values.ndim == 1:
 
             if transpose:
                 result = result.T
@@ -1498,7 +1510,12 @@ def func(cond, values, other):
             if try_cast:
                 result = self._try_cast_result(result)
 
-            return self.make_block(result)
+            if isinstance(result, np.ndarray):
+                ndim = result.ndim
+            else:
+                ndim = None
+
+            return self.make_block(result, ndim=ndim, fill_value=fill_value)
 
         # might need to separate out blocks
         axis = cond.ndim - 1
@@ -1512,7 +1529,8 @@ def func(cond, values, other):
                 r = self._try_cast_result(result.take(m.nonzero()[0],
                                                       axis=axis))
                 result_blocks.append(
-                    self.make_block(r.T, placement=self.mgr_locs[m]))
+                    self.make_block_same_class(r.T,
+                                               placement=self.mgr_locs[m]))
 
         return result_blocks
 
@@ -1832,6 +1850,7 @@ class FloatBlock(FloatOrComplexBlock):
     is_float = True
     _downcast_dtype = 'int64'
 
+    @classmethod
     def _can_hold_element(self, element):
         tipo = maybe_infer_dtype_type(element)
         if tipo is not None:
@@ -1881,6 +1900,7 @@ class ComplexBlock(FloatOrComplexBlock):
     __slots__ = ()
     is_complex = True
 
+    @classmethod
     def _can_hold_element(self, element):
         tipo = maybe_infer_dtype_type(element)
         if tipo is not None:
@@ -2042,6 +2062,7 @@ class BoolBlock(NumericBlock):
     is_bool = True
     _can_hold_na = False
 
+    @classmethod
     def _can_hold_element(self, element):
         tipo = maybe_infer_dtype_type(element)
         if tipo is not None:
@@ -2751,11 +2772,63 @@ class SparseBlock(NonConsolidatableMixIn, Block):
     is_sparse = True
     is_numeric = True
     _box_to_block_values = False
-    _can_hold_na = True
     _ftype = 'sparse'
     _holder = SparseArray
     _concatenator = staticmethod(_concat._concat_sparse)
 
+    def __init__(self, values, placement, ndim=None, fastpath=False, **kwargs):
+        super(SparseBlock, self).__init__(values, placement,
+                                          ndim, fastpath,
+                                          **kwargs)
+
+        dtype = self.values.sp_values.dtype
+
+        if is_float_dtype(dtype):
+            self.is_float = True
+            self._can_hold_na = True
+        elif is_complex_dtype(dtype):
+            self.is_complex = True
+            self._can_hold_na = True
+        elif is_integer_dtype(dtype):
+            self.is_integer = True
+            self._can_hold_na = False
+        elif is_bool_dtype(dtype):
+            self.is_bool = True
+            self._can_hold_na = False
+        elif is_object_dtype(dtype):
+            self.is_object = True
+            self._can_hold_na = True
+        else:
+            self._can_hold_na = False
+
+    def _can_hold_element(self, element):
+        """ require the same dtype as ourselves """
+        dtype = self.values.sp_values.dtype
+
+        if is_bool_dtype(dtype):
+            return BoolBlock._can_hold_element(element)
+        elif is_integer_dtype(dtype):
+            if is_list_like(element):
+                element = np.array(element)
+                tipo = element.dtype.type
+                return (issubclass(tipo, np.integer) and
+                        not issubclass(tipo,
+                                       (np.datetime64,
+                                        np.timedelta64)) and
+                        dtype.itemsize >= element.dtype.itemsize)
+            return is_integer(element)
+        elif is_float_dtype(dtype):
+            return FloatBlock._can_hold_element(element)
+        elif is_complex_dtype(dtype):
+            return ComplexBlock._can_hold_element(element)
+        elif is_object_dtype(dtype):
+            return True
+        else:
+            return False
+
+    def coerce_to_target_dtype(self, other, copy=True):
+        return super(SparseBlock, self).coerce_to_target_dtype(other, copy)
+
     @property
     def shape(self):
         return (len(self.mgr_locs), self.sp_index.length)
@@ -2816,6 +2889,20 @@ def copy(self, deep=True, mgr=None):
                                           kind=self.kind, copy=deep,
                                           placement=self.mgr_locs)
 
+    def make_block(self, values, placement=None,
+                   ndim=None, fill_value=None, **kwargs):
+        """
+        Create a new block, with type inference propagate any values that are
+        not specified
+        """
+        if fill_value is not None and isinstance(values, SparseArray):
+            values = SparseArray(values.to_dense(), fill_value=fill_value,
+                                 kind=values.kind, dtype=values.dtype)
+
+        return super(SparseBlock, self).make_block(values, placement=placement,
+                                                   ndim=ndim, fill_value=None,
+                                                   **kwargs)
+
     def make_block_same_class(self, values, placement, sparse_index=None,
                               kind=None, dtype=None, fill_value=None,
                               copy=False, fastpath=True, **kwargs):
@@ -2912,9 +2999,15 @@ def sparse_reindex(self, new_index):
         return self.make_block_same_class(values, sparse_index=new_index,
                                           placement=self.mgr_locs)
 
+    def _try_coerce_result(self, result):
+        """ reverse of try_coerce_args """
+        if isinstance(result, np.ndarray):
+            result = SparseArray(result.flatten(), kind=self.kind)
+        return result
+
 
 def make_block(values, placement, klass=None, ndim=None, dtype=None,
-               fastpath=False):
+               fastpath=False, **kwargs):
     if klass is None:
         dtype = dtype or values.dtype
         vtype = dtype.type
diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py
index 0424ac8703e25..699618b11448d 100644
--- a/pandas/core/sparse/array.py
+++ b/pandas/core/sparse/array.py
@@ -248,7 +248,7 @@ def _simple_new(cls, data, sp_index, fill_value):
                 sp_index.ngaps > 0):
             # if float fill_value is being included in dense repr,
             # convert values to float
-            data = data.astype(float)
+            data = data.astype(float, copy=True)
 
         result = data.view(cls)
 
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
index 1b45b180b8dc1..c5cba667d865c 100644
--- a/pandas/core/sparse/frame.py
+++ b/pandas/core/sparse/frame.py
@@ -321,8 +321,9 @@ def _apply_columns(self, func):
             data=new_data, index=self.index, columns=self.columns,
             default_fill_value=self.default_fill_value).__finalize__(self)
 
-    def astype(self, dtype):
-        return self._apply_columns(lambda x: x.astype(dtype))
+    def astype(self, dtype, copy=True, errors='raise', **kwargs):
+        return self._apply_columns(lambda x: x.astype(dtype, copy,
+                                                      errors, **kwargs))
 
     def copy(self, deep=True):
         """
@@ -333,6 +334,15 @@ def copy(self, deep=True):
         result._default_kind = self._default_kind
         return result
 
+    def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
+              try_cast=False, raise_on_error=True):
+        result = super(SparseDataFrame, self).where(cond, other,
+                                                    inplace, axis,
+                                                    level, try_cast,
+                                                    raise_on_error)
+        result._default_fill_value = other
+        return result
+
     @property
     def default_fill_value(self):
         return self._default_fill_value

From 1cbb4a86426cda438740a0ebe7775d078eb29884 Mon Sep 17 00:00:00 2001
From: Licht-T <licht-t@outlook.jp>
Date: Sun, 12 Nov 2017 18:32:00 +0900
Subject: [PATCH 2/3] BUG: Fix wrong argument in Sparse.where

---
 pandas/core/sparse/frame.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
index c5cba667d865c..19b33c3d6df8c 100644
--- a/pandas/core/sparse/frame.py
+++ b/pandas/core/sparse/frame.py
@@ -336,10 +336,11 @@ def copy(self, deep=True):
 
     def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
               try_cast=False, raise_on_error=True):
-        result = super(SparseDataFrame, self).where(cond, other,
-                                                    inplace, axis,
-                                                    level, try_cast,
-                                                    raise_on_error)
+        result = super(SparseDataFrame,
+                       self).where(cond, other,
+                                   inplace, axis,
+                                   level, try_cast,
+                                   raise_on_error=raise_on_error)
         result._default_fill_value = other
         return result
 

From 6b36d55a34cd3338c6bd64a43c4708007f8901ba Mon Sep 17 00:00:00 2001
From: Licht-T <licht-t@outlook.jp>
Date: Sun, 12 Nov 2017 18:33:00 +0900
Subject: [PATCH 3/3] TST: Remove xfail/skip marks from Sparse.where tests

---
 pandas/tests/sparse/test_frame.py  | 12 ------------
 pandas/tests/sparse/test_series.py | 10 ----------
 2 files changed, 22 deletions(-)

diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py
index e65059156c5b9..b8bc0530294a5 100644
--- a/pandas/tests/sparse/test_frame.py
+++ b/pandas/tests/sparse/test_frame.py
@@ -1410,8 +1410,6 @@ def test_numpy_func_call(self):
             [nan, nan]
         ]
     ])
-    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
-                              '(GH 17386)')
     def test_where_with_numeric_data(self, data):
         # GH 17386
         lower_bound = 1.5
@@ -1443,8 +1441,6 @@ def test_where_with_numeric_data(self, data):
         0.1,
         100.0 + 100.0j
     ])
-    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
-                              '(GH 17386)')
     def test_where_with_numeric_data_and_other(self, data, other):
         # GH 17386
         lower_bound = 1.5
@@ -1460,8 +1456,6 @@ def test_where_with_numeric_data_and_other(self, data, other):
         tm.assert_frame_equal(result, dense_expected)
         tm.assert_sp_frame_equal(result, sparse_expected)
 
-    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
-                              '(GH 17386)')
     def test_where_with_bool_data(self):
         # GH 17386
         data = [[False, False], [True, True], [False, False]]
@@ -1483,8 +1477,6 @@ def test_where_with_bool_data(self):
         0.1,
         100.0 + 100.0j
     ])
-    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
-                              '(GH 17386)')
     def test_where_with_bool_data_and_other(self, other):
         # GH 17386
         data = [[False, False], [True, True], [False, False]]
@@ -1501,8 +1493,6 @@ def test_where_with_bool_data_and_other(self, other):
         tm.assert_frame_equal(result, dense_expected)
         tm.assert_sp_frame_equal(result, sparse_expected)
 
-    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
-                              '(GH 17386)')
     def test_quantile(self):
         # GH 17386
         data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
@@ -1518,8 +1508,6 @@ def test_quantile(self):
         tm.assert_series_equal(result, dense_expected)
         tm.assert_sp_series_equal(result, sparse_expected)
 
-    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
-                              '(GH 17386)')
     def test_quantile_multi(self):
         # GH 17386
         data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py
index 1dc1c7f1575cc..4014826847611 100644
--- a/pandas/tests/sparse/test_series.py
+++ b/pandas/tests/sparse/test_series.py
@@ -1430,8 +1430,6 @@ def test_deprecated_reindex_axis(self):
             nan, nan
         ]
     ])
-    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
-                              '(GH 17386)')
     def test_where_with_numeric_data(self, data):
         # GH 17386
         lower_bound = 1.5
@@ -1463,9 +1461,6 @@ def test_where_with_numeric_data(self, data):
         0.1,
         100.0 + 100.0j
     ])
-    @pytest.mark.skip(reason='Wrong SparseBlock initialization '
-                             '(Segfault) '
-                             '(GH 17386)')
     def test_where_with_numeric_data_and_other(self, data, other):
         # GH 17386
         lower_bound = 1.5
@@ -1480,8 +1475,6 @@ def test_where_with_numeric_data_and_other(self, data, other):
         tm.assert_series_equal(result, dense_expected)
         tm.assert_sp_series_equal(result, sparse_expected)
 
-    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
-                              '(GH 17386)')
     def test_where_with_bool_data(self):
         # GH 17386
         data = [False, False, True, True, False, False]
@@ -1503,9 +1496,6 @@ def test_where_with_bool_data(self):
         0.1,
         100.0 + 100.0j
     ])
-    @pytest.mark.skip(reason='Wrong SparseBlock initialization '
-                             '(Segfault) '
-                             '(GH 17386)')
     def test_where_with_bool_data_and_other(self, other):
         # GH 17386
         data = [False, False, True, True, False, False]