Skip to content

BUG: Fix wrong SparseBlock initialization #17386

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 106 additions & 13 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
is_bool_dtype,
is_object_dtype,
is_datetimelike_v_numeric,
is_complex_dtype,
is_float_dtype, is_numeric_dtype,
is_numeric_v_string_like, is_extension_type,
is_list_like,
Expand Down Expand Up @@ -454,8 +455,11 @@ def make_a_block(nv, ref_loc):
nv = _block_shape(nv, ndim=self.ndim)
except (AttributeError, NotImplementedError):
pass

block = self.make_block(values=nv,
placement=ref_loc, fastpath=True)
placement=ref_loc,
fastpath=True)

return block

# ndim == 1
Expand Down Expand Up @@ -1020,7 +1024,7 @@ def f(m, v, i):

return [self.make_block(new_values, fastpath=True)]

def coerce_to_target_dtype(self, other):
def coerce_to_target_dtype(self, other, copy=False):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, I dont think this actually helps, why did you add it?

Copy link
Contributor Author

@Licht-T Licht-T Sep 15, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SparseBlock astype(dtype, copy=False) makes reinterpret cast, so I override coerce_to_target_dtype and set copy=True in SparseBlock class.
203a8f9#diff-e705e723b2d6e7c0e2a0443f80916abfR2639

"""
coerce the current block to a dtype compat for other
we will return a block, possibly object, and not raise
Expand All @@ -1037,7 +1041,7 @@ def coerce_to_target_dtype(self, other):

if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype):
# we don't upcast to bool
return self.astype(object)
return self.astype(object, copy=copy)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this is ok, though numpy ignores the copy= flag when dtype is object so no point in passing it if dtype is object


elif ((self.is_float or self.is_complex) and
(is_integer_dtype(dtype) or is_float_dtype(dtype))):
Expand All @@ -1051,14 +1055,14 @@ def coerce_to_target_dtype(self, other):
# not a datetime
if not ((is_datetime64_dtype(dtype) or
is_datetime64tz_dtype(dtype)) and self.is_datetime):
return self.astype(object)
return self.astype(object, copy=copy)

# don't upcast timezone with different timezone or no timezone
mytz = getattr(self.dtype, 'tz', None)
othertz = getattr(dtype, 'tz', None)

if str(mytz) != str(othertz):
return self.astype(object)
return self.astype(object, copy=copy)

raise AssertionError("possible recursion in "
"coerce_to_target_dtype: {} {}".format(
Expand All @@ -1068,18 +1072,18 @@ def coerce_to_target_dtype(self, other):

# not a timedelta
if not (is_timedelta64_dtype(dtype) and self.is_timedelta):
return self.astype(object)
return self.astype(object, copy=copy)

raise AssertionError("possible recursion in "
"coerce_to_target_dtype: {} {}".format(
self, other))

try:
return self.astype(dtype)
return self.astype(dtype, copy=copy)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here i guess is ok

except (ValueError, TypeError):
pass

return self.astype(object)
return self.astype(object, copy=copy)

def interpolate(self, method='pad', axis=0, index=None, values=None,
inplace=False, limit=None, limit_direction='forward',
Expand Down Expand Up @@ -1440,6 +1444,11 @@ def where(self, other, cond, align=True, errors='raise',
if hasattr(other, 'reindex_axis'):
other = other.values

if is_scalar(other) or is_list_like(other):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

huh?

fill_value = other
else:
fill_value = None

if hasattr(cond, 'reindex_axis'):
cond = cond.values

Expand All @@ -1452,6 +1461,9 @@ def where(self, other, cond, align=True, errors='raise',
if not hasattr(cond, 'shape'):
raise ValueError("where must have a condition that is ndarray "
"like")
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

huh? if you find yourself adding an if/then pretty much anywhere then you are doing it wrong. better to add the method to Sparse and call super; sometimes the super method may need a bit of refactor to make it more general.

if self.is_sparse:
cond = cond.flatten()

# our where function
def func(cond, values, other):
Expand Down Expand Up @@ -1489,7 +1501,7 @@ def func(cond, values, other):
transpose=transpose)
return self._maybe_downcast(blocks, 'infer')

if self._can_hold_na or self.ndim == 1:
if self._can_hold_element(fill_value) or values.ndim == 1:

if transpose:
result = result.T
Expand All @@ -1498,7 +1510,12 @@ def func(cond, values, other):
if try_cast:
result = self._try_cast_result(result)

return self.make_block(result)
if isinstance(result, np.ndarray):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again this is just completely confusing to do and makes the code way more complex. find a better way

ndim = result.ndim
else:
ndim = None

return self.make_block(result, ndim=ndim, fill_value=fill_value)

# might need to separate out blocks
axis = cond.ndim - 1
Expand All @@ -1512,7 +1529,8 @@ def func(cond, values, other):
r = self._try_cast_result(result.take(m.nonzero()[0],
axis=axis))
result_blocks.append(
self.make_block(r.T, placement=self.mgr_locs[m]))
self.make_block_same_class(r.T,
placement=self.mgr_locs[m]))

return result_blocks

Expand Down Expand Up @@ -1832,6 +1850,7 @@ class FloatBlock(FloatOrComplexBlock):
is_float = True
_downcast_dtype = 'int64'

@classmethod
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is such a huge change, what is the purpose?

def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
Expand Down Expand Up @@ -1881,6 +1900,7 @@ class ComplexBlock(FloatOrComplexBlock):
__slots__ = ()
is_complex = True

@classmethod
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
Expand Down Expand Up @@ -2042,6 +2062,7 @@ class BoolBlock(NumericBlock):
is_bool = True
_can_hold_na = False

@classmethod
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
Expand Down Expand Up @@ -2751,11 +2772,63 @@ class SparseBlock(NonConsolidatableMixIn, Block):
is_sparse = True
is_numeric = True
_box_to_block_values = False
_can_hold_na = True
_ftype = 'sparse'
_holder = SparseArray
_concatenator = staticmethod(_concat._concat_sparse)

def __init__(self, values, placement, ndim=None, fastpath=False, **kwargs):
super(SparseBlock, self).__init__(values, placement,
ndim, fastpath,
**kwargs)

dtype = self.values.sp_values.dtype

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is pretty complex, pls simplify

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added these codes because coerce_to_target_dtype does not work well in SparseBlock. This is only checking the type information of SparseArray and these procedure is also implemented in IntBlock, etc. I cannot figure out how to simplify. Any suggestion for simplifying?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not against this theorectially, but the implementation is fragile here. there are functions for validation already in the sparse classes.

if is_float_dtype(dtype):
self.is_float = True
self._can_hold_na = True
elif is_complex_dtype(dtype):
self.is_complex = True
self._can_hold_na = True
elif is_integer_dtype(dtype):
self.is_integer = True
self._can_hold_na = False
elif is_bool_dtype(dtype):
self.is_bool = True
self._can_hold_na = False
elif is_object_dtype(dtype):
self.is_object = True
self._can_hold_na = True
else:
self._can_hold_na = False

def _can_hold_element(self, element):
""" require the same dtype as ourselves """
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again this is so complex and adds so much techincal debt.

dtype = self.values.sp_values.dtype

if is_bool_dtype(dtype):
return BoolBlock._can_hold_element(element)
elif is_integer_dtype(dtype):
if is_list_like(element):
element = np.array(element)
tipo = element.dtype.type
return (issubclass(tipo, np.integer) and
not issubclass(tipo,
(np.datetime64,
np.timedelta64)) and
dtype.itemsize >= element.dtype.itemsize)
return is_integer(element)
elif is_float_dtype(dtype):
return FloatBlock._can_hold_element(element)
elif is_complex_dtype(dtype):
return ComplexBlock._can_hold_element(element)
elif is_object_dtype(dtype):
return True
else:
return False

def coerce_to_target_dtype(self, other, copy=True):
return super(SparseBlock, self).coerce_to_target_dtype(other, copy)

@property
def shape(self):
return (len(self.mgr_locs), self.sp_index.length)
Expand Down Expand Up @@ -2816,6 +2889,20 @@ def copy(self, deep=True, mgr=None):
kind=self.kind, copy=deep,
placement=self.mgr_locs)

def make_block(self, values, placement=None,
ndim=None, fill_value=None, **kwargs):
"""
Create a new block, with type inference propagate any values that are
not specified
"""
if fill_value is not None and isinstance(values, SparseArray):
values = SparseArray(values.to_dense(), fill_value=fill_value,
kind=values.kind, dtype=values.dtype)

return super(SparseBlock, self).make_block(values, placement=placement,
ndim=ndim, fill_value=None,
**kwargs)

def make_block_same_class(self, values, placement, sparse_index=None,
kind=None, dtype=None, fill_value=None,
copy=False, fastpath=True, **kwargs):
Expand Down Expand Up @@ -2912,9 +2999,15 @@ def sparse_reindex(self, new_index):
return self.make_block_same_class(values, sparse_index=new_index,
placement=self.mgr_locs)

def _try_coerce_result(self, result):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this is the right idea, though you shoul dgenerally have a function that intercepts a ndarray and creates a SparseArray; it should be called for most sparse methods.

""" reverse of try_coerce_args """
if isinstance(result, np.ndarray):
result = SparseArray(result.flatten(), kind=self.kind)
return result


def make_block(values, placement, klass=None, ndim=None, dtype=None,
fastpath=False):
fastpath=False, **kwargs):
if klass is None:
dtype = dtype or values.dtype
vtype = dtype.type
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def _simple_new(cls, data, sp_index, fill_value):
sp_index.ngaps > 0):
# if float fill_value is being included in dense repr,
# convert values to float
data = data.astype(float)
data = data.astype(float, copy=True)

result = data.view(cls)

Expand Down
15 changes: 13 additions & 2 deletions pandas/core/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,8 +321,9 @@ def _apply_columns(self, func):
data=new_data, index=self.index, columns=self.columns,
default_fill_value=self.default_fill_value).__finalize__(self)

def astype(self, dtype):
return self._apply_columns(lambda x: x.astype(dtype))
def astype(self, dtype, copy=True, errors='raise', **kwargs):
return self._apply_columns(lambda x: x.astype(dtype, copy,
errors, **kwargs))

def copy(self, deep=True):
"""
Expand All @@ -333,6 +334,16 @@ def copy(self, deep=True):
result._default_kind = self._default_kind
return result

def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
try_cast=False, raise_on_error=True):
result = super(SparseDataFrame,
self).where(cond, other,
inplace, axis,
level, try_cast,
raise_on_error=raise_on_error)
result._default_fill_value = other
return result

@property
def default_fill_value(self):
return self._default_fill_value
Expand Down
12 changes: 0 additions & 12 deletions pandas/tests/sparse/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1410,8 +1410,6 @@ def test_numpy_func_call(self):
[nan, nan]
]
])
@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
'(GH 17386)')
def test_where_with_numeric_data(self, data):
# GH 17386
lower_bound = 1.5
Expand Down Expand Up @@ -1443,8 +1441,6 @@ def test_where_with_numeric_data(self, data):
0.1,
100.0 + 100.0j
])
@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
'(GH 17386)')
def test_where_with_numeric_data_and_other(self, data, other):
# GH 17386
lower_bound = 1.5
Expand All @@ -1460,8 +1456,6 @@ def test_where_with_numeric_data_and_other(self, data, other):
tm.assert_frame_equal(result, dense_expected)
tm.assert_sp_frame_equal(result, sparse_expected)

@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
'(GH 17386)')
def test_where_with_bool_data(self):
# GH 17386
data = [[False, False], [True, True], [False, False]]
Expand All @@ -1483,8 +1477,6 @@ def test_where_with_bool_data(self):
0.1,
100.0 + 100.0j
])
@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
'(GH 17386)')
def test_where_with_bool_data_and_other(self, other):
# GH 17386
data = [[False, False], [True, True], [False, False]]
Expand All @@ -1501,8 +1493,6 @@ def test_where_with_bool_data_and_other(self, other):
tm.assert_frame_equal(result, dense_expected)
tm.assert_sp_frame_equal(result, sparse_expected)

@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
'(GH 17386)')
def test_quantile(self):
# GH 17386
data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
Expand All @@ -1518,8 +1508,6 @@ def test_quantile(self):
tm.assert_series_equal(result, dense_expected)
tm.assert_sp_series_equal(result, sparse_expected)

@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
'(GH 17386)')
def test_quantile_multi(self):
# GH 17386
data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
Expand Down
10 changes: 0 additions & 10 deletions pandas/tests/sparse/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1430,8 +1430,6 @@ def test_deprecated_reindex_axis(self):
nan, nan
]
])
@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
'(GH 17386)')
def test_where_with_numeric_data(self, data):
# GH 17386
lower_bound = 1.5
Expand Down Expand Up @@ -1463,9 +1461,6 @@ def test_where_with_numeric_data(self, data):
0.1,
100.0 + 100.0j
])
@pytest.mark.skip(reason='Wrong SparseBlock initialization '
'(Segfault) '
'(GH 17386)')
def test_where_with_numeric_data_and_other(self, data, other):
# GH 17386
lower_bound = 1.5
Expand All @@ -1480,8 +1475,6 @@ def test_where_with_numeric_data_and_other(self, data, other):
tm.assert_series_equal(result, dense_expected)
tm.assert_sp_series_equal(result, sparse_expected)

@pytest.mark.xfail(reason='Wrong SparseBlock initialization '
'(GH 17386)')
def test_where_with_bool_data(self):
# GH 17386
data = [False, False, True, True, False, False]
Expand All @@ -1503,9 +1496,6 @@ def test_where_with_bool_data(self):
0.1,
100.0 + 100.0j
])
@pytest.mark.skip(reason='Wrong SparseBlock initialization '
'(Segfault) '
'(GH 17386)')
def test_where_with_bool_data_and_other(self, other):
# GH 17386
data = [False, False, True, True, False, False]
Expand Down