From f89a38061d2d11d12997ef5abbed1023b29cc489 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Thu, 31 Aug 2017 08:34:24 +0900 Subject: [PATCH 1/3] BUG: Fix wrong SparseBlock initialization in where method BUG: Fix wrong SparseBlock initialization in quantile method BUG: Fix make_sparse mask generation not to cast when dtype is object BUG: Add SparseArray.all method BUG: Add copy parameter to prevent reinterpret cast of sparse Revert and fix astype parameters BUG: Create SparseBlock.__init__ to set type information of SparseArray BUG: Override SparseBlock._can_hold_element Revert changes in Block.where BUG: Override SparseBlock.make_block with fill_value argument BUG: Set fill_value and ndim parameter in make_block when generating SparseBlock from result BUG: Override SparseBlock._try_coerce_result to make result flattened and sparse BUG: Change from _can_hold_na to _can_hold_element for supporting non-NA fill value BUG: Fix 1D check statement SparseDataFrame.where passes (1, n)-shape SparseBlock, but actual values is an n-length SparseArray BUG: Adjust cond shape to SparseBlock SparseDataFrame.where passes (1, n)-shape SparseBlock and condition block to Block.where, but it compares n-length SparseArray held by the SparseBlock and (1, n)-shape condition block. 
BUG: Override SparseDataFrame.where method to set _default_fill_value --- pandas/core/internals.py | 119 ++++++++++++++++++++++++++++++++---- pandas/core/sparse/array.py | 2 +- pandas/core/sparse/frame.py | 14 ++++- 3 files changed, 119 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 045580d393b26..7bae661ba93dd 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -29,6 +29,7 @@ is_bool_dtype, is_object_dtype, is_datetimelike_v_numeric, + is_complex_dtype, is_float_dtype, is_numeric_dtype, is_numeric_v_string_like, is_extension_type, is_list_like, @@ -454,8 +455,11 @@ def make_a_block(nv, ref_loc): nv = _block_shape(nv, ndim=self.ndim) except (AttributeError, NotImplementedError): pass + block = self.make_block(values=nv, - placement=ref_loc, fastpath=True) + placement=ref_loc, + fastpath=True) + return block # ndim == 1 @@ -1020,7 +1024,7 @@ def f(m, v, i): return [self.make_block(new_values, fastpath=True)] - def coerce_to_target_dtype(self, other): + def coerce_to_target_dtype(self, other, copy=False): """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -1037,7 +1041,7 @@ def coerce_to_target_dtype(self, other): if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): # we don't upcast to bool - return self.astype(object) + return self.astype(object, copy=copy) elif ((self.is_float or self.is_complex) and (is_integer_dtype(dtype) or is_float_dtype(dtype))): @@ -1051,14 +1055,14 @@ def coerce_to_target_dtype(self, other): # not a datetime if not ((is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)) and self.is_datetime): - return self.astype(object) + return self.astype(object, copy=copy) # don't upcast timezone with different timezone or no timezone mytz = getattr(self.dtype, 'tz', None) othertz = getattr(dtype, 'tz', None) if str(mytz) != str(othertz): - return self.astype(object) + return self.astype(object, 
copy=copy) raise AssertionError("possible recursion in " "coerce_to_target_dtype: {} {}".format( @@ -1068,18 +1072,18 @@ def coerce_to_target_dtype(self, other): # not a timedelta if not (is_timedelta64_dtype(dtype) and self.is_timedelta): - return self.astype(object) + return self.astype(object, copy=copy) raise AssertionError("possible recursion in " "coerce_to_target_dtype: {} {}".format( self, other)) try: - return self.astype(dtype) + return self.astype(dtype, copy=copy) except (ValueError, TypeError): pass - return self.astype(object) + return self.astype(object, copy=copy) def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, limit_direction='forward', @@ -1440,6 +1444,11 @@ def where(self, other, cond, align=True, errors='raise', if hasattr(other, 'reindex_axis'): other = other.values + if is_scalar(other) or is_list_like(other): + fill_value = other + else: + fill_value = None + if hasattr(cond, 'reindex_axis'): cond = cond.values @@ -1452,6 +1461,9 @@ def where(self, other, cond, align=True, errors='raise', if not hasattr(cond, 'shape'): raise ValueError("where must have a condition that is ndarray " "like") + else: + if self.is_sparse: + cond = cond.flatten() # our where function def func(cond, values, other): @@ -1489,7 +1501,7 @@ def func(cond, values, other): transpose=transpose) return self._maybe_downcast(blocks, 'infer') - if self._can_hold_na or self.ndim == 1: + if self._can_hold_element(fill_value) or values.ndim == 1: if transpose: result = result.T @@ -1498,7 +1510,12 @@ def func(cond, values, other): if try_cast: result = self._try_cast_result(result) - return self.make_block(result) + if isinstance(result, np.ndarray): + ndim = result.ndim + else: + ndim = None + + return self.make_block(result, ndim=ndim, fill_value=fill_value) # might need to separate out blocks axis = cond.ndim - 1 @@ -1512,7 +1529,8 @@ def func(cond, values, other): r = self._try_cast_result(result.take(m.nonzero()[0], 
axis=axis)) result_blocks.append( - self.make_block(r.T, placement=self.mgr_locs[m])) + self.make_block_same_class(r.T, + placement=self.mgr_locs[m])) return result_blocks @@ -1832,6 +1850,7 @@ class FloatBlock(FloatOrComplexBlock): is_float = True _downcast_dtype = 'int64' + @classmethod def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: @@ -1881,6 +1900,7 @@ class ComplexBlock(FloatOrComplexBlock): __slots__ = () is_complex = True + @classmethod def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: @@ -2042,6 +2062,7 @@ class BoolBlock(NumericBlock): is_bool = True _can_hold_na = False + @classmethod def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: @@ -2751,11 +2772,63 @@ class SparseBlock(NonConsolidatableMixIn, Block): is_sparse = True is_numeric = True _box_to_block_values = False - _can_hold_na = True _ftype = 'sparse' _holder = SparseArray _concatenator = staticmethod(_concat._concat_sparse) + def __init__(self, values, placement, ndim=None, fastpath=False, **kwargs): + super(SparseBlock, self).__init__(values, placement, + ndim, fastpath, + **kwargs) + + dtype = self.values.sp_values.dtype + + if is_float_dtype(dtype): + self.is_float = True + self._can_hold_na = True + elif is_complex_dtype(dtype): + self.is_complex = True + self._can_hold_na = True + elif is_integer_dtype(dtype): + self.is_integer = True + self._can_hold_na = False + elif is_bool_dtype(dtype): + self.is_bool = True + self._can_hold_na = False + elif is_object_dtype(dtype): + self.is_object = True + self._can_hold_na = True + else: + self._can_hold_na = False + + def _can_hold_element(self, element): + """ require the same dtype as ourselves """ + dtype = self.values.sp_values.dtype + + if is_bool_dtype(dtype): + return BoolBlock._can_hold_element(element) + elif is_integer_dtype(dtype): + if is_list_like(element): + element = np.array(element) + tipo 
= element.dtype.type + return (issubclass(tipo, np.integer) and + not issubclass(tipo, + (np.datetime64, + np.timedelta64)) and + dtype.itemsize >= element.dtype.itemsize) + return is_integer(element) + elif is_float_dtype(dtype): + return FloatBlock._can_hold_element(element) + elif is_complex_dtype(dtype): + return ComplexBlock._can_hold_element(element) + elif is_object_dtype(dtype): + return True + else: + return False + + def coerce_to_target_dtype(self, other, copy=True): + return super(SparseBlock, self).coerce_to_target_dtype(other, copy) + @property def shape(self): return (len(self.mgr_locs), self.sp_index.length) @@ -2816,6 +2889,20 @@ def copy(self, deep=True, mgr=None): kind=self.kind, copy=deep, placement=self.mgr_locs) + def make_block(self, values, placement=None, + ndim=None, fill_value=None, **kwargs): + """ + Create a new block, with type inference propagate any values that are + not specified + """ + if fill_value is not None and isinstance(values, SparseArray): + values = SparseArray(values.to_dense(), fill_value=fill_value, + kind=values.kind, dtype=values.dtype) + + return super(SparseBlock, self).make_block(values, placement=placement, + ndim=ndim, fill_value=None, + **kwargs) + def make_block_same_class(self, values, placement, sparse_index=None, kind=None, dtype=None, fill_value=None, copy=False, fastpath=True, **kwargs): @@ -2912,9 +2999,15 @@ def sparse_reindex(self, new_index): return self.make_block_same_class(values, sparse_index=new_index, placement=self.mgr_locs) + def _try_coerce_result(self, result): + """ reverse of try_coerce_args """ + if isinstance(result, np.ndarray): + result = SparseArray(result.flatten(), kind=self.kind) + return result + def make_block(values, placement, klass=None, ndim=None, dtype=None, - fastpath=False): + fastpath=False, **kwargs): if klass is None: dtype = dtype or values.dtype vtype = dtype.type diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 0424ac8703e25..699618b11448d 
100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -248,7 +248,7 @@ def _simple_new(cls, data, sp_index, fill_value): sp_index.ngaps > 0): # if float fill_value is being included in dense repr, # convert values to float - data = data.astype(float) + data = data.astype(float, copy=True) result = data.view(cls) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 1b45b180b8dc1..c5cba667d865c 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -321,8 +321,9 @@ def _apply_columns(self, func): data=new_data, index=self.index, columns=self.columns, default_fill_value=self.default_fill_value).__finalize__(self) - def astype(self, dtype): - return self._apply_columns(lambda x: x.astype(dtype)) + def astype(self, dtype, copy=True, errors='raise', **kwargs): + return self._apply_columns(lambda x: x.astype(dtype, copy, + errors, **kwargs)) def copy(self, deep=True): """ @@ -333,6 +334,15 @@ def copy(self, deep=True): result._default_kind = self._default_kind return result + def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, + try_cast=False, raise_on_error=True): + result = super(SparseDataFrame, self).where(cond, other, + inplace, axis, + level, try_cast, + raise_on_error) + result._default_fill_value = other + return result + @property def default_fill_value(self): return self._default_fill_value From 1cbb4a86426cda438740a0ebe7775d078eb29884 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 12 Nov 2017 18:32:00 +0900 Subject: [PATCH 2/3] BUG: Fix wrong argument in Sparse.where --- pandas/core/sparse/frame.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index c5cba667d865c..19b33c3d6df8c 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -336,10 +336,11 @@ def copy(self, deep=True): def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, 
try_cast=False, raise_on_error=True): - result = super(SparseDataFrame, self).where(cond, other, - inplace, axis, - level, try_cast, - raise_on_error) + result = super(SparseDataFrame, + self).where(cond, other, + inplace, axis, + level, try_cast, + raise_on_error=raise_on_error) result._default_fill_value = other return result From 6b36d55a34cd3338c6bd64a43c4708007f8901ba Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 12 Nov 2017 18:33:00 +0900 Subject: [PATCH 3/3] TST: Remove xfail/skip marks from Sparse.where tests --- pandas/tests/sparse/test_frame.py | 12 ------------ pandas/tests/sparse/test_series.py | 10 ---------- 2 files changed, 22 deletions(-) diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index e65059156c5b9..b8bc0530294a5 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -1410,8 +1410,6 @@ def test_numpy_func_call(self): [nan, nan] ] ]) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' - '(GH 17386)') def test_where_with_numeric_data(self, data): # GH 17386 lower_bound = 1.5 @@ -1443,8 +1441,6 @@ def test_where_with_numeric_data(self, data): 0.1, 100.0 + 100.0j ]) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' - '(GH 17386)') def test_where_with_numeric_data_and_other(self, data, other): # GH 17386 lower_bound = 1.5 @@ -1460,8 +1456,6 @@ def test_where_with_numeric_data_and_other(self, data, other): tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' - '(GH 17386)') def test_where_with_bool_data(self): # GH 17386 data = [[False, False], [True, True], [False, False]] @@ -1483,8 +1477,6 @@ def test_where_with_bool_data(self): 0.1, 100.0 + 100.0j ]) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' - '(GH 17386)') def test_where_with_bool_data_and_other(self, other): # GH 17386 data = [[False, False], [True, True], [False, 
False]] @@ -1501,8 +1493,6 @@ def test_where_with_bool_data_and_other(self, other): tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' - '(GH 17386)') def test_quantile(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] @@ -1518,8 +1508,6 @@ def test_quantile(self): tm.assert_series_equal(result, dense_expected) tm.assert_sp_series_equal(result, sparse_expected) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' - '(GH 17386)') def test_quantile_multi(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py index 1dc1c7f1575cc..4014826847611 100644 --- a/pandas/tests/sparse/test_series.py +++ b/pandas/tests/sparse/test_series.py @@ -1430,8 +1430,6 @@ def test_deprecated_reindex_axis(self): nan, nan ] ]) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' - '(GH 17386)') def test_where_with_numeric_data(self, data): # GH 17386 lower_bound = 1.5 @@ -1463,9 +1461,6 @@ def test_where_with_numeric_data(self, data): 0.1, 100.0 + 100.0j ]) - @pytest.mark.skip(reason='Wrong SparseBlock initialization ' - '(Segfault) ' - '(GH 17386)') def test_where_with_numeric_data_and_other(self, data, other): # GH 17386 lower_bound = 1.5 @@ -1480,8 +1475,6 @@ def test_where_with_numeric_data_and_other(self, data, other): tm.assert_series_equal(result, dense_expected) tm.assert_sp_series_equal(result, sparse_expected) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' - '(GH 17386)') def test_where_with_bool_data(self): # GH 17386 data = [False, False, True, True, False, False] @@ -1503,9 +1496,6 @@ def test_where_with_bool_data(self): 0.1, 100.0 + 100.0j ]) - @pytest.mark.skip(reason='Wrong SparseBlock initialization ' - '(Segfault) ' - '(GH 17386)') def test_where_with_bool_data_and_other(self, other): # GH 17386 data = [False, False, 
True, True, False, False]