Skip to content

BUG/ENH: guarantee blocks will upcast as needed, and split as needed #3065

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 16, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ pandas 0.11.0
doesn't have nans, then an int will be returned)
- backfill/pad/take/diff/ohlc will now support ``float32/int16/int8``
operations
- Integer block types will upcast as needed in where operations (GH2793_)
- Block types will upcast as needed in where/masking operations (GH2793_)
- Series now automatically will try to set the correct dtype based on passed
datetimelike objects (datetime/Timestamp)

Expand Down
12 changes: 12 additions & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,13 @@ def mask_missing(arr, values_to_mask):
for x in nonna:
if mask is None:
mask = arr == x

# if x is a string and mask is not, then we get a scalar
# return value, which is not good
if not isinstance(mask,np.ndarray):
m = mask
mask = np.empty(arr.shape,dtype=np.bool)
mask.fill(m)
else:
mask = mask | (arr == x)

Expand Down Expand Up @@ -730,6 +737,11 @@ def _maybe_promote(dtype, fill_value=np.nan):
dtype = np.complex128
else:
dtype = np.object_

# in case we have a string that looked like a number
if issubclass(np.dtype(dtype).type, basestring):
dtype = np.object_

return dtype, fill_value


Expand Down
153 changes: 77 additions & 76 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from numpy import nan
import numpy as np

from pandas.core.common import _possibly_downcast_to_dtype
from pandas.core.common import _possibly_downcast_to_dtype, isnull
from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes
from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices
import pandas.core.common as com
Expand Down Expand Up @@ -260,32 +260,14 @@ def _try_cast_result(self, result):
return result

def replace(self, to_replace, value, inplace=False):
new_values = self.values if inplace else self.values.copy()
if self._can_hold_element(value):
value = self._try_cast(value)

if not isinstance(to_replace, (list, np.ndarray)):
if self._can_hold_element(to_replace):
to_replace = self._try_cast(to_replace)
msk = com.mask_missing(new_values, to_replace)
np.putmask(new_values, msk, value)
else:
try:
to_replace = np.array(to_replace, dtype=self.dtype)
msk = com.mask_missing(new_values, to_replace)
np.putmask(new_values, msk, value)
except Exception:
to_replace = np.array(to_replace, dtype=object)
for r in to_replace:
if self._can_hold_element(r):
r = self._try_cast(r)
msk = com.mask_missing(new_values, to_replace)
np.putmask(new_values, msk, value)

if inplace:
return self
else:
return make_block(new_values, self.items, self.ref_items)
""" replace the to_replace value with value, possible to create new blocks here
this is just a call to putmask """
mask = com.mask_missing(self.values, to_replace)
if not mask.any():
if inplace:
return [ self ]
return [ self.copy() ]
return self.putmask(mask, value, inplace=inplace)

def putmask(self, mask, new, inplace=False):
""" putmask the data to the block; it is possible that we may create a new dtype of block
Expand All @@ -309,19 +291,34 @@ def putmask(self, mask, new, inplace=False):

# maybe upcast me
elif mask.any():
# type of the new block
if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or
isinstance(new, float)):
typ = np.float64
else:
typ = np.object_

# we need to explicitly astype here to make a copy
new_values = new_values.astype(typ)
# need to go column by column
new_blocks = []
for i, item in enumerate(self.items):

# we create a new block type
np.putmask(new_values, mask, new)
return [ make_block(new_values, self.items, self.ref_items) ]
m = mask[i]

# need a new block
if m.any():

n = new[i] if isinstance(new, np.ndarray) else new

# type of the new block
dtype, _ = com._maybe_promote(np.array(n).dtype)

# we need to explicitly astype here to make a copy
nv = new_values[i].astype(dtype)

# we create a new block type
np.putmask(nv, m, n)

else:
nv = new_values[i] if inplace else new_values[i].copy()

nv = _block_shape(nv)
new_blocks.append(make_block(nv, [ item ], self.ref_items))

return new_blocks

if inplace:
return [ self ]
Expand Down Expand Up @@ -350,7 +347,7 @@ def interpolate(self, method='pad', axis=0, inplace=False,
if missing is None:
mask = None
else: # todo create faster fill func without masking
mask = _mask_missing(transf(values), missing)
mask = com.mask_missing(transf(values), missing)

if method == 'pad':
com.pad_2d(transf(values), limit=limit, mask=mask)
Expand Down Expand Up @@ -532,31 +529,14 @@ def create_block(result, items, transpose = True):
if len(result) == 1:
result = np.repeat(result,self.shape[1:])

result = result.reshape(((1,) + self.shape[1:]))
result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:])
result_blocks.append(create_block(result, item, transpose = False))

return result_blocks
else:
result = func(cond,values,other)
return create_block(result, self.items)

def _mask_missing(array, missing_values):
    """Build a boolean mask of the entries of ``array`` that match any
    of ``missing_values``.

    Parameters
    ----------
    array : np.ndarray
        Values to test.
    missing_values : scalar, list or np.ndarray
        Value(s) to look for; a scalar is wrapped into a one-element list.
        Null entries are matched with ``com.isnull`` rather than ``==``.

    Returns
    -------
    mask : boolean result of the comparisons, or None when
        ``missing_values`` is empty (nothing was compared).
    """
    if not isinstance(missing_values, (list, np.ndarray)):
        missing_values = [missing_values]

    mask = None
    # object dtype keeps heterogeneous values (e.g. numbers and strings) intact
    missing_values = np.array(missing_values, dtype=object)
    if com.isnull(missing_values).any():
        # nulls cannot be found with ``==``; handle them separately and
        # drop them from the list of values compared below
        mask = com.isnull(array)
        missing_values = missing_values[com.notnull(missing_values)]

    for v in missing_values:
        # compare against the single value ``v`` -- comparing against the
        # whole ``missing_values`` array would broadcast it against
        # ``array`` instead of testing membership
        if mask is None:
            mask = array == v
        else:
            mask |= array == v
    return mask

class NumericBlock(Block):
is_numeric = True
_can_hold_na = True
Expand Down Expand Up @@ -659,7 +639,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True):
values = self.get(c)

values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
values = values.reshape(((1,) + values.shape))
values = _block_shape(values)
items = self.items.take([i])
newb = make_block(values, items, self.ref_items)
blocks.append(newb)
Expand Down Expand Up @@ -949,23 +929,37 @@ def replace(self, *args, **kwargs):

def replace_list(self, src_lst, dest_lst, inplace=False):
""" do a list replace """
if not inplace:
self = self.copy()

sset = set(src_lst)
if any([k in sset for k in dest_lst]):
masks = {}
for s in src_lst:
masks[s] = [b.values == s for b in self.blocks]

for s, d in zip(src_lst, dest_lst):
[b.putmask(masks[s][i], d, inplace=True) for i, b in
enumerate(self.blocks)]
else:
for s, d in zip(src_lst, dest_lst):
self.replace(s, d, inplace=True)

return self
# figure out our mask a-priori to avoid repeated replacements
values = self.as_matrix()
def comp(s):
if isnull(s):
return isnull(values)
return values == s
masks = [ comp(s) for i, s in enumerate(src_lst) ]

result_blocks = []
for blk in self.blocks:

# it's possible to get multiple result blocks here
# replace ALWAYS will return a list
rb = [ blk if inplace else blk.copy() ]
for i, d in enumerate(dest_lst):
new_rb = []
for b in rb:
# get our mask for this element, sized to this
# particular block
m = masks[i][b.ref_locs]
if m.any():
new_rb.extend(b.putmask(m, d, inplace=True))
else:
new_rb.append(b)
rb = new_rb
result_blocks.extend(rb)

bm = self.__class__(result_blocks, self.axes)
bm._consolidate_inplace()
return bm

def is_consolidated(self):
"""
Expand Down Expand Up @@ -1302,8 +1296,7 @@ def set(self, item, value):
Set new item in-place. Does not consolidate. Adds new Block if not
contained in the current set of items
"""
if value.ndim == self.ndim - 1:
value = value.reshape((1,) + value.shape)
value = _block_shape(value,self.ndim-1)
if value.shape[1:] != self.shape[1:]:
raise AssertionError('Shape of new values must be compatible '
'with manager shape')
Expand Down Expand Up @@ -1873,6 +1866,14 @@ def _merge_blocks(blocks, items):
return new_block.reindex_items_from(items)


def _block_shape(values, ndim=1, shape=None):
    """Prepend a length-1 leading axis to ``values`` when it has exactly
    ``ndim`` dimensions; otherwise return it unchanged.

    Parameters
    ----------
    values : np.ndarray
        Array to reshape.
    ndim : int, default 1
        Number of dimensions at which ``values`` is reshaped.
    shape : sequence of int, optional
        Trailing shape to use after the new leading axis; defaults to
        ``values.shape``.

    Returns
    -------
    np.ndarray
        ``values`` reshaped to ``(1,) + shape`` when ``values.ndim == ndim``,
        else the input object itself.
    """
    if values.ndim == ndim:
        if shape is None:
            shape = values.shape
        # tuple(shape) accepts a list (or any sequence), not only a tuple
        values = values.reshape((1,) + tuple(shape))
    return values

def _vstack(to_stack):
if all(x.dtype == _NS_DTYPE for x in to_stack):
# work around NumPy 1.6 bug
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5596,6 +5596,31 @@ def test_replace_mixed(self):
assert_frame_equal(result, expected)
assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

# int block upcasting
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') })
expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64') })
result = df.replace(0, 0.5)
assert_frame_equal(result,expected)

df.replace(0, 0.5, inplace=True)
assert_frame_equal(df,expected)

# int block splitting
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64'), 'C' : Series([1,2],dtype='int64') })
expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64'), 'C' : Series([1,2],dtype='int64') })
result = df.replace(0, 0.5)
assert_frame_equal(result,expected)

# to object block upcasting
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') })
expected = DataFrame({ 'A' : Series([1,'foo'],dtype='object'), 'B' : Series([0,1],dtype='int64') })
result = df.replace(2, 'foo')
assert_frame_equal(result,expected)

expected = DataFrame({ 'A' : Series(['foo','bar'],dtype='object'), 'B' : Series([0,'foo'],dtype='object') })
result = df.replace([1,2], ['foo','bar'])
assert_frame_equal(result,expected)

def test_replace_interpolate(self):
padded = self.tsframe.replace(nan, method='pad')
assert_frame_equal(padded, self.tsframe.fillna(method='pad'))
Expand Down