-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Fix wrong SparseBlock initialization #17386
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
is_bool_dtype, | ||
is_object_dtype, | ||
is_datetimelike_v_numeric, | ||
is_complex_dtype, | ||
is_float_dtype, is_numeric_dtype, | ||
is_numeric_v_string_like, is_extension_type, | ||
is_list_like, | ||
|
@@ -454,8 +455,11 @@ def make_a_block(nv, ref_loc): | |
nv = _block_shape(nv, ndim=self.ndim) | ||
except (AttributeError, NotImplementedError): | ||
pass | ||
|
||
block = self.make_block(values=nv, | ||
placement=ref_loc, fastpath=True) | ||
placement=ref_loc, | ||
fastpath=True) | ||
|
||
return block | ||
|
||
# ndim == 1 | ||
|
@@ -1020,7 +1024,7 @@ def f(m, v, i): | |
|
||
return [self.make_block(new_values, fastpath=True)] | ||
|
||
def coerce_to_target_dtype(self, other): | ||
def coerce_to_target_dtype(self, other, copy=False): | ||
""" | ||
coerce the current block to a dtype compat for other | ||
we will return a block, possibly object, and not raise | ||
|
@@ -1037,7 +1041,7 @@ def coerce_to_target_dtype(self, other): | |
|
||
if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): | ||
# we don't upcast to bool | ||
return self.astype(object) | ||
return self.astype(object, copy=copy) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess this is ok, though numpy ignores the |
||
|
||
elif ((self.is_float or self.is_complex) and | ||
(is_integer_dtype(dtype) or is_float_dtype(dtype))): | ||
|
@@ -1051,14 +1055,14 @@ def coerce_to_target_dtype(self, other): | |
# not a datetime | ||
if not ((is_datetime64_dtype(dtype) or | ||
is_datetime64tz_dtype(dtype)) and self.is_datetime): | ||
return self.astype(object) | ||
return self.astype(object, copy=copy) | ||
|
||
# don't upcast timezone with different timezone or no timezone | ||
mytz = getattr(self.dtype, 'tz', None) | ||
othertz = getattr(dtype, 'tz', None) | ||
|
||
if str(mytz) != str(othertz): | ||
return self.astype(object) | ||
return self.astype(object, copy=copy) | ||
|
||
raise AssertionError("possible recursion in " | ||
"coerce_to_target_dtype: {} {}".format( | ||
|
@@ -1068,18 +1072,18 @@ def coerce_to_target_dtype(self, other): | |
|
||
# not a timedelta | ||
if not (is_timedelta64_dtype(dtype) and self.is_timedelta): | ||
return self.astype(object) | ||
return self.astype(object, copy=copy) | ||
|
||
raise AssertionError("possible recursion in " | ||
"coerce_to_target_dtype: {} {}".format( | ||
self, other)) | ||
|
||
try: | ||
return self.astype(dtype) | ||
return self.astype(dtype, copy=copy) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here i guess is ok |
||
except (ValueError, TypeError): | ||
pass | ||
|
||
return self.astype(object) | ||
return self.astype(object, copy=copy) | ||
|
||
def interpolate(self, method='pad', axis=0, index=None, values=None, | ||
inplace=False, limit=None, limit_direction='forward', | ||
|
@@ -1440,6 +1444,11 @@ def where(self, other, cond, align=True, errors='raise', | |
if hasattr(other, 'reindex_axis'): | ||
other = other.values | ||
|
||
if is_scalar(other) or is_list_like(other): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. huh? |
||
fill_value = other | ||
else: | ||
fill_value = None | ||
|
||
if hasattr(cond, 'reindex_axis'): | ||
cond = cond.values | ||
|
||
|
@@ -1452,6 +1461,9 @@ def where(self, other, cond, align=True, errors='raise', | |
if not hasattr(cond, 'shape'): | ||
raise ValueError("where must have a condition that is ndarray " | ||
"like") | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. huh? if you find yourself adding an if/then pretty much anywhere then you are doing it wrong. better to add the method to Sparse and call super; sometimes the super method may need a bit of refactor to make it more general. |
||
if self.is_sparse: | ||
cond = cond.flatten() | ||
|
||
# our where function | ||
def func(cond, values, other): | ||
|
@@ -1489,7 +1501,7 @@ def func(cond, values, other): | |
transpose=transpose) | ||
return self._maybe_downcast(blocks, 'infer') | ||
|
||
if self._can_hold_na or self.ndim == 1: | ||
if self._can_hold_element(fill_value) or values.ndim == 1: | ||
|
||
if transpose: | ||
result = result.T | ||
|
@@ -1498,7 +1510,12 @@ def func(cond, values, other): | |
if try_cast: | ||
result = self._try_cast_result(result) | ||
|
||
return self.make_block(result) | ||
if isinstance(result, np.ndarray): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. again this is just completely confusing to do and makes the code way more complex. find a better way |
||
ndim = result.ndim | ||
else: | ||
ndim = None | ||
|
||
return self.make_block(result, ndim=ndim, fill_value=fill_value) | ||
|
||
# might need to separate out blocks | ||
axis = cond.ndim - 1 | ||
|
@@ -1512,7 +1529,8 @@ def func(cond, values, other): | |
r = self._try_cast_result(result.take(m.nonzero()[0], | ||
axis=axis)) | ||
result_blocks.append( | ||
self.make_block(r.T, placement=self.mgr_locs[m])) | ||
self.make_block_same_class(r.T, | ||
placement=self.mgr_locs[m])) | ||
|
||
return result_blocks | ||
|
||
|
@@ -1832,6 +1850,7 @@ class FloatBlock(FloatOrComplexBlock): | |
is_float = True | ||
_downcast_dtype = 'int64' | ||
|
||
@classmethod | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is such a huge change, what is the purpose? |
||
def _can_hold_element(self, element): | ||
tipo = maybe_infer_dtype_type(element) | ||
if tipo is not None: | ||
|
@@ -1881,6 +1900,7 @@ class ComplexBlock(FloatOrComplexBlock): | |
__slots__ = () | ||
is_complex = True | ||
|
||
@classmethod | ||
def _can_hold_element(self, element): | ||
tipo = maybe_infer_dtype_type(element) | ||
if tipo is not None: | ||
|
@@ -2042,6 +2062,7 @@ class BoolBlock(NumericBlock): | |
is_bool = True | ||
_can_hold_na = False | ||
|
||
@classmethod | ||
def _can_hold_element(self, element): | ||
tipo = maybe_infer_dtype_type(element) | ||
if tipo is not None: | ||
|
@@ -2751,11 +2772,63 @@ class SparseBlock(NonConsolidatableMixIn, Block): | |
is_sparse = True | ||
is_numeric = True | ||
_box_to_block_values = False | ||
_can_hold_na = True | ||
_ftype = 'sparse' | ||
_holder = SparseArray | ||
_concatenator = staticmethod(_concat._concat_sparse) | ||
|
||
def __init__(self, values, placement, ndim=None, fastpath=False, **kwargs): | ||
super(SparseBlock, self).__init__(values, placement, | ||
ndim, fastpath, | ||
**kwargs) | ||
|
||
dtype = self.values.sp_values.dtype | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is pretty complex, pls simplify There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added these codes because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not against this theorectially, but the implementation is fragile here. there are functions for validation already in the sparse classes. |
||
if is_float_dtype(dtype): | ||
self.is_float = True | ||
self._can_hold_na = True | ||
elif is_complex_dtype(dtype): | ||
self.is_complex = True | ||
self._can_hold_na = True | ||
elif is_integer_dtype(dtype): | ||
self.is_integer = True | ||
self._can_hold_na = False | ||
elif is_bool_dtype(dtype): | ||
self.is_bool = True | ||
self._can_hold_na = False | ||
elif is_object_dtype(dtype): | ||
self.is_object = True | ||
self._can_hold_na = True | ||
else: | ||
self._can_hold_na = False | ||
|
||
def _can_hold_element(self, element): | ||
""" require the same dtype as ourselves """ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. again this is so complex and adds so much techincal debt. |
||
dtype = self.values.sp_values.dtype | ||
|
||
if is_bool_dtype(dtype): | ||
return BoolBlock._can_hold_element(element) | ||
elif is_integer_dtype(dtype): | ||
if is_list_like(element): | ||
element = np.array(element) | ||
tipo = element.dtype.type | ||
return (issubclass(tipo, np.integer) and | ||
not issubclass(tipo, | ||
(np.datetime64, | ||
np.timedelta64)) and | ||
dtype.itemsize >= element.dtype.itemsize) | ||
return is_integer(element) | ||
elif is_float_dtype(dtype): | ||
return FloatBlock._can_hold_element(element) | ||
elif is_complex_dtype(dtype): | ||
return ComplexBlock._can_hold_element(element) | ||
elif is_object_dtype(dtype): | ||
return True | ||
else: | ||
return False | ||
|
||
def coerce_to_target_dtype(self, other, copy=True): | ||
return super(SparseBlock, self).coerce_to_target_dtype(other, copy) | ||
|
||
@property | ||
def shape(self): | ||
return (len(self.mgr_locs), self.sp_index.length) | ||
|
@@ -2816,6 +2889,20 @@ def copy(self, deep=True, mgr=None): | |
kind=self.kind, copy=deep, | ||
placement=self.mgr_locs) | ||
|
||
def make_block(self, values, placement=None, | ||
ndim=None, fill_value=None, **kwargs): | ||
""" | ||
Create a new block, with type inference propagate any values that are | ||
not specified | ||
""" | ||
if fill_value is not None and isinstance(values, SparseArray): | ||
values = SparseArray(values.to_dense(), fill_value=fill_value, | ||
kind=values.kind, dtype=values.dtype) | ||
|
||
return super(SparseBlock, self).make_block(values, placement=placement, | ||
ndim=ndim, fill_value=None, | ||
**kwargs) | ||
|
||
def make_block_same_class(self, values, placement, sparse_index=None, | ||
kind=None, dtype=None, fill_value=None, | ||
copy=False, fastpath=True, **kwargs): | ||
|
@@ -2912,9 +2999,15 @@ def sparse_reindex(self, new_index): | |
return self.make_block_same_class(values, sparse_index=new_index, | ||
placement=self.mgr_locs) | ||
|
||
def _try_coerce_result(self, result): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so this is the right idea, though you shoul dgenerally have a function that intercepts a ndarray and creates a SparseArray; it should be called for most sparse methods. |
||
""" reverse of try_coerce_args """ | ||
if isinstance(result, np.ndarray): | ||
result = SparseArray(result.flatten(), kind=self.kind) | ||
return result | ||
|
||
|
||
def make_block(values, placement, klass=None, ndim=None, dtype=None, | ||
fastpath=False): | ||
fastpath=False, **kwargs): | ||
if klass is None: | ||
dtype = dtype or values.dtype | ||
vtype = dtype.type | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hmm, I dont think this actually helps, why did you add it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SparseBlock
astype(dtype, copy=False)
makes reinterpret cast, so I overridecoerce_to_target_dtype
and setcopy=True
inSparseBlock
class.203a8f9#diff-e705e723b2d6e7c0e2a0443f80916abfR2639