def replace(self, to_replace, value, inplace=False):
    """ swap every occurrence of to_replace for value; the write itself is
        delegated to putmask, so more than one block may come back.
        always returns a list of blocks """
    hits = com.mask_missing(self.values, to_replace)

    # something matched: let putmask do the write (and any upcasting)
    if hits.any():
        return self.putmask(hits, value, inplace=inplace)

    # nothing matched: hand back this block, or a copy when not inplace
    unchanged = self if inplace else self.copy()
    return [unchanged]
com._maybe_promote(np.array(n).dtype) + + # we need to exiplicty astype here to make a copy + nv = new_values[i].astype(dtype) + + # we create a new block type + np.putmask(nv, m, n) + + else: + nv = new_values[i] if inplace else new_values[i].copy() + + nv = _block_shape(nv) + new_blocks.append(make_block(nv, [ item ], self.ref_items)) + + return new_blocks if inplace: return [ self ] @@ -350,7 +347,7 @@ def interpolate(self, method='pad', axis=0, inplace=False, if missing is None: mask = None else: # todo create faster fill func without masking - mask = _mask_missing(transf(values), missing) + mask = com.mask_missing(transf(values), missing) if method == 'pad': com.pad_2d(transf(values), limit=limit, mask=mask) @@ -532,7 +529,7 @@ def create_block(result, items, transpose = True): if len(result) == 1: result = np.repeat(result,self.shape[1:]) - result = result.reshape(((1,) + self.shape[1:])) + result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:]) result_blocks.append(create_block(result, item, transpose = False)) return result_blocks @@ -540,23 +537,6 @@ def create_block(result, items, transpose = True): result = func(cond,values,other) return create_block(result, self.items) -def _mask_missing(array, missing_values): - if not isinstance(missing_values, (list, np.ndarray)): - missing_values = [missing_values] - - mask = None - missing_values = np.array(missing_values, dtype=object) - if com.isnull(missing_values).any(): - mask = com.isnull(array) - missing_values = missing_values[com.notnull(missing_values)] - - for v in missing_values: - if mask is None: - mask = array == missing_values - else: - mask |= array == missing_values - return mask - class NumericBlock(Block): is_numeric = True _can_hold_na = True @@ -659,7 +639,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True): values = self.get(c) values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric) - values = 
def replace_list(self, src_lst, dest_lst, inplace=False):
    """
    Replace each value of ``src_lst`` with the value at the same position
    in ``dest_lst``, across every block of this manager.

    All masks are computed from the original values before anything is
    written, so a value produced by one replacement is never picked up
    again by a later one.

    Parameters
    ----------
    src_lst : list
        Values to look for. Null entries are matched with ``isnull``.
    dest_lst : list
        Replacement values, paired positionally with ``src_lst``.
    inplace : bool, default False
        When True the existing blocks are written to instead of copies.

    Returns
    -------
    A new, consolidated manager of the same class as ``self``. A new
    manager object is built in both the inplace and the copying case.
    """
    # figure out our masks a-priori to avoid repeated replacements
    values = self.as_matrix()

    def comp(s):
        if isnull(s):
            return isnull(values)
        mask = values == s
        if not isinstance(mask, np.ndarray):
            # comparing against an incompatible type (e.g. a string
            # against numeric data) can give back a scalar instead of an
            # array; expand it so it can be indexed per block below
            flag = mask is not NotImplemented and bool(mask)
            mask = np.empty(values.shape, dtype=bool)
            mask.fill(flag)
        return mask

    masks = [comp(s) for s in src_lst]

    result_blocks = []
    for blk in self.blocks:

        # it is possible to get multiple result blocks here, because
        # putmask ALWAYS returns a list
        rb = [blk if inplace else blk.copy()]
        for i, d in enumerate(dest_lst):
            new_rb = []
            for b in rb:
                # cut the mask for this element down to this
                # particular block
                m = masks[i][b.ref_locs]
                if m.any():
                    new_rb.extend(b.putmask(m, d, inplace=True))
                else:
                    new_rb.append(b)
            rb = new_rb
        result_blocks.extend(rb)

    bm = self.__class__(result_blocks, self.axes)
    bm._consolidate_inplace()
    return bm
Adds new Block if not contained in the current set of items """ - if value.ndim == self.ndim - 1: - value = value.reshape((1,) + value.shape) + value = _block_shape(value,self.ndim-1) if value.shape[1:] != self.shape[1:]: raise AssertionError('Shape of new values must be compatible ' 'with manager shape') @@ -1873,6 +1866,14 @@ def _merge_blocks(blocks, items): return new_block.reindex_items_from(items) +def _block_shape(values, ndim=1, shape=None): + """ guarantee the shape of the values to be at least 1 d """ + if values.ndim == ndim: + if shape is None: + shape = values.shape + values = values.reshape(tuple((1,) + shape)) + return values + def _vstack(to_stack): if all(x.dtype == _NS_DTYPE for x in to_stack): # work around NumPy 1.6 bug diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6db1db76d664a..2cdb4488b8126 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5596,6 +5596,31 @@ def test_replace_mixed(self): assert_frame_equal(result, expected) assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame) + # int block upcasting + df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') }) + expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64') }) + result = df.replace(0, 0.5) + assert_frame_equal(result,expected) + + df.replace(0, 0.5, inplace=True) + assert_frame_equal(df,expected) + + # int block splitting + df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64'), 'C' : Series([1,2],dtype='int64') }) + expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64'), 'C' : Series([1,2],dtype='int64') }) + result = df.replace(0, 0.5) + assert_frame_equal(result,expected) + + # to object block upcasting + df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') }) + expected = DataFrame({ 'A' : 
Series([1,'foo'],dtype='object'), 'B' : Series([0,1],dtype='int64') }) + result = df.replace(2, 'foo') + assert_frame_equal(result,expected) + + expected = DataFrame({ 'A' : Series(['foo','bar'],dtype='object'), 'B' : Series([0,'foo'],dtype='object') }) + result = df.replace([1,2], ['foo','bar']) + assert_frame_equal(result,expected) + def test_replace_interpolate(self): padded = self.tsframe.replace(nan, method='pad') assert_frame_equal(padded, self.tsframe.fillna(method='pad'))