Skip to content

Commit fbfd16a

Browse files
committed
Merge pull request #3065 from jreback/replace
BUG/ENH: guarantee blocks will upcast as needed, and split as needed
2 parents dde093e + 9e09328 commit fbfd16a

File tree

4 files changed

+115
-77
lines changed

4 files changed

+115
-77
lines changed

RELEASE.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ pandas 0.11.0
7979
doesn't have nans, then an int will be returned)
8080
- backfill/pad/take/diff/ohlc will now support ``float32/int16/int8``
8181
operations
82-
- Integer block types will upcast as needed in where operations (GH2793_)
82+
- Block types will upcast as needed in where/masking operations (GH2793_)
8383
- Series now automatically will try to set the correct dtype based on passed
8484
datetimelike objects (datetime/Timestamp)
8585

pandas/core/common.py

+12
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,13 @@ def mask_missing(arr, values_to_mask):
221221
for x in nonna:
222222
if mask is None:
223223
mask = arr == x
224+
225+
# if x is a string and mask is not, then we get a scalar
226+
# return value, which is not good
227+
if not isinstance(mask,np.ndarray):
228+
m = mask
229+
mask = np.empty(arr.shape,dtype=np.bool)
230+
mask.fill(m)
224231
else:
225232
mask = mask | (arr == x)
226233

@@ -730,6 +737,11 @@ def _maybe_promote(dtype, fill_value=np.nan):
730737
dtype = np.complex128
731738
else:
732739
dtype = np.object_
740+
741+
# in case we have a string that looked like a number
742+
if issubclass(np.dtype(dtype).type, basestring):
743+
dtype = np.object_
744+
733745
return dtype, fill_value
734746

735747

pandas/core/internals.py

+77-76
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from numpy import nan
55
import numpy as np
66

7-
from pandas.core.common import _possibly_downcast_to_dtype
7+
from pandas.core.common import _possibly_downcast_to_dtype, isnull
88
from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes
99
from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices
1010
import pandas.core.common as com
@@ -260,32 +260,14 @@ def _try_cast_result(self, result):
260260
return result
261261

262262
def replace(self, to_replace, value, inplace=False):
263-
new_values = self.values if inplace else self.values.copy()
264-
if self._can_hold_element(value):
265-
value = self._try_cast(value)
266-
267-
if not isinstance(to_replace, (list, np.ndarray)):
268-
if self._can_hold_element(to_replace):
269-
to_replace = self._try_cast(to_replace)
270-
msk = com.mask_missing(new_values, to_replace)
271-
np.putmask(new_values, msk, value)
272-
else:
273-
try:
274-
to_replace = np.array(to_replace, dtype=self.dtype)
275-
msk = com.mask_missing(new_values, to_replace)
276-
np.putmask(new_values, msk, value)
277-
except Exception:
278-
to_replace = np.array(to_replace, dtype=object)
279-
for r in to_replace:
280-
if self._can_hold_element(r):
281-
r = self._try_cast(r)
282-
msk = com.mask_missing(new_values, to_replace)
283-
np.putmask(new_values, msk, value)
284-
285-
if inplace:
286-
return self
287-
else:
288-
return make_block(new_values, self.items, self.ref_items)
263+
""" replace the to_replace value with value, possible to create new blocks here
264+
this is just a call to putmask """
265+
mask = com.mask_missing(self.values, to_replace)
266+
if not mask.any():
267+
if inplace:
268+
return [ self ]
269+
return [ self.copy() ]
270+
return self.putmask(mask, value, inplace=inplace)
289271

290272
def putmask(self, mask, new, inplace=False):
291273
""" putmask the data to the block; it is possible that we may create a new dtype of block
@@ -309,19 +291,34 @@ def putmask(self, mask, new, inplace=False):
309291

310292
# maybe upcast me
311293
elif mask.any():
312-
# type of the new block
313-
if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or
314-
isinstance(new, float)):
315-
typ = np.float64
316-
else:
317-
typ = np.object_
318294

319-
# we need to explicitly astype here to make a copy
320-
new_values = new_values.astype(typ)
295+
# need to go column by column
296+
new_blocks = []
297+
for i, item in enumerate(self.items):
321298

322-
# we create a new block type
323-
np.putmask(new_values, mask, new)
324-
return [ make_block(new_values, self.items, self.ref_items) ]
299+
m = mask[i]
300+
301+
# need a new block
302+
if m.any():
303+
304+
n = new[i] if isinstance(new, np.ndarray) else new
305+
306+
# type of the new block
307+
dtype, _ = com._maybe_promote(np.array(n).dtype)
308+
309+
# we need to explicitly astype here to make a copy
310+
nv = new_values[i].astype(dtype)
311+
312+
# we create a new block type
313+
np.putmask(nv, m, n)
314+
315+
else:
316+
nv = new_values[i] if inplace else new_values[i].copy()
317+
318+
nv = _block_shape(nv)
319+
new_blocks.append(make_block(nv, [ item ], self.ref_items))
320+
321+
return new_blocks
325322

326323
if inplace:
327324
return [ self ]
@@ -350,7 +347,7 @@ def interpolate(self, method='pad', axis=0, inplace=False,
350347
if missing is None:
351348
mask = None
352349
else: # todo create faster fill func without masking
353-
mask = _mask_missing(transf(values), missing)
350+
mask = com.mask_missing(transf(values), missing)
354351

355352
if method == 'pad':
356353
com.pad_2d(transf(values), limit=limit, mask=mask)
@@ -532,31 +529,14 @@ def create_block(result, items, transpose = True):
532529
if len(result) == 1:
533530
result = np.repeat(result,self.shape[1:])
534531

535-
result = result.reshape(((1,) + self.shape[1:]))
532+
result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:])
536533
result_blocks.append(create_block(result, item, transpose = False))
537534

538535
return result_blocks
539536
else:
540537
result = func(cond,values,other)
541538
return create_block(result, self.items)
542539

543-
def _mask_missing(array, missing_values):
544-
if not isinstance(missing_values, (list, np.ndarray)):
545-
missing_values = [missing_values]
546-
547-
mask = None
548-
missing_values = np.array(missing_values, dtype=object)
549-
if com.isnull(missing_values).any():
550-
mask = com.isnull(array)
551-
missing_values = missing_values[com.notnull(missing_values)]
552-
553-
for v in missing_values:
554-
if mask is None:
555-
mask = array == missing_values
556-
else:
557-
mask |= array == missing_values
558-
return mask
559-
560540
class NumericBlock(Block):
561541
is_numeric = True
562542
_can_hold_na = True
@@ -659,7 +639,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True):
659639
values = self.get(c)
660640

661641
values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
662-
values = values.reshape(((1,) + values.shape))
642+
values = _block_shape(values)
663643
items = self.items.take([i])
664644
newb = make_block(values, items, self.ref_items)
665645
blocks.append(newb)
@@ -949,23 +929,37 @@ def replace(self, *args, **kwargs):
949929

950930
def replace_list(self, src_lst, dest_lst, inplace=False):
951931
""" do a list replace """
952-
if not inplace:
953-
self = self.copy()
954-
955-
sset = set(src_lst)
956-
if any([k in sset for k in dest_lst]):
957-
masks = {}
958-
for s in src_lst:
959-
masks[s] = [b.values == s for b in self.blocks]
960-
961-
for s, d in zip(src_lst, dest_lst):
962-
[b.putmask(masks[s][i], d, inplace=True) for i, b in
963-
enumerate(self.blocks)]
964-
else:
965-
for s, d in zip(src_lst, dest_lst):
966-
self.replace(s, d, inplace=True)
967932

968-
return self
933+
# figure out our mask a priori to avoid repeated replacements
934+
values = self.as_matrix()
935+
def comp(s):
936+
if isnull(s):
937+
return isnull(values)
938+
return values == s
939+
masks = [ comp(s) for i, s in enumerate(src_lst) ]
940+
941+
result_blocks = []
942+
for blk in self.blocks:
943+
944+
# it's possible to get multiple result blocks here
945+
# replace ALWAYS will return a list
946+
rb = [ blk if inplace else blk.copy() ]
947+
for i, d in enumerate(dest_lst):
948+
new_rb = []
949+
for b in rb:
950+
# get our mask for this element, sized to this
951+
# particular block
952+
m = masks[i][b.ref_locs]
953+
if m.any():
954+
new_rb.extend(b.putmask(m, d, inplace=True))
955+
else:
956+
new_rb.append(b)
957+
rb = new_rb
958+
result_blocks.extend(rb)
959+
960+
bm = self.__class__(result_blocks, self.axes)
961+
bm._consolidate_inplace()
962+
return bm
969963

970964
def is_consolidated(self):
971965
"""
@@ -1302,8 +1296,7 @@ def set(self, item, value):
13021296
Set new item in-place. Does not consolidate. Adds new Block if not
13031297
contained in the current set of items
13041298
"""
1305-
if value.ndim == self.ndim - 1:
1306-
value = value.reshape((1,) + value.shape)
1299+
value = _block_shape(value,self.ndim-1)
13071300
if value.shape[1:] != self.shape[1:]:
13081301
raise AssertionError('Shape of new values must be compatible '
13091302
'with manager shape')
@@ -1873,6 +1866,14 @@ def _merge_blocks(blocks, items):
18731866
return new_block.reindex_items_from(items)
18741867

18751868

1869+
def _block_shape(values, ndim=1, shape=None):
1870+
""" guarantee the shape of the values to be at least 1 d """
1871+
if values.ndim == ndim:
1872+
if shape is None:
1873+
shape = values.shape
1874+
values = values.reshape(tuple((1,) + shape))
1875+
return values
1876+
18761877
def _vstack(to_stack):
18771878
if all(x.dtype == _NS_DTYPE for x in to_stack):
18781879
# work around NumPy 1.6 bug

pandas/tests/test_frame.py

+25
Original file line numberDiff line numberDiff line change
@@ -5596,6 +5596,31 @@ def test_replace_mixed(self):
55965596
assert_frame_equal(result, expected)
55975597
assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)
55985598

5599+
# int block upcasting
5600+
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') })
5601+
expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64') })
5602+
result = df.replace(0, 0.5)
5603+
assert_frame_equal(result,expected)
5604+
5605+
df.replace(0, 0.5, inplace=True)
5606+
assert_frame_equal(df,expected)
5607+
5608+
# int block splitting
5609+
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64'), 'C' : Series([1,2],dtype='int64') })
5610+
expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64'), 'C' : Series([1,2],dtype='int64') })
5611+
result = df.replace(0, 0.5)
5612+
assert_frame_equal(result,expected)
5613+
5614+
# to object block upcasting
5615+
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') })
5616+
expected = DataFrame({ 'A' : Series([1,'foo'],dtype='object'), 'B' : Series([0,1],dtype='int64') })
5617+
result = df.replace(2, 'foo')
5618+
assert_frame_equal(result,expected)
5619+
5620+
expected = DataFrame({ 'A' : Series(['foo','bar'],dtype='object'), 'B' : Series([0,'foo'],dtype='object') })
5621+
result = df.replace([1,2], ['foo','bar'])
5622+
assert_frame_equal(result,expected)
5623+
55995624
def test_replace_interpolate(self):
56005625
padded = self.tsframe.replace(nan, method='pad')
56015626
assert_frame_equal(padded, self.tsframe.fillna(method='pad'))

0 commit comments

Comments
 (0)