From 100f0b700eecd9927d2979db3e4ec04b0c726661 Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Sat, 4 Apr 2015 15:13:33 -0400 Subject: [PATCH] BUG: DataFrame.where does not respect axis parameter when shape is symmetric (GH #9736) --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/generic.py | 20 ++++-- pandas/core/internals.py | 117 +++++++++++++++++--------------- pandas/tests/test_frame.py | 104 ++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+), 57 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 14e185b5b2a26..94ae92811c4e6 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -88,6 +88,7 @@ Bug Fixes - Bug in ``ExcelReader`` when worksheet is empty (:issue:`6403`) +- Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`) - Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3bf998c1fa5a7..8742abc2d48b6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3634,19 +3634,31 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, else: other = self._constructor(other, **self._construct_axes_dict()) + if axis is None: + axis = 0 + + if self.ndim == getattr(other, 'ndim', 0): + align = True + else: + align = (self._get_axis_number(axis) == 1) + + block_axis = self._get_block_manager_axis(axis) + if inplace: # we may have different type blocks come out of putmask, so # reconstruct the block manager self._check_inplace_setting(other) - new_data = self._data.putmask(mask=cond, new=other, align=axis is None, - inplace=True) + new_data = self._data.putmask(mask=cond, new=other, align=align, + inplace=True, axis=block_axis, + transpose=self._AXIS_REVERSED) self._update_inplace(new_data) else: - new_data = self._data.where(other=other, cond=cond, align=axis is None, + new_data = self._data.where(other=other, cond=cond, align=align, raise_on_error=raise_on_error, - try_cast=try_cast) + try_cast=try_cast, axis=block_axis, + transpose=self._AXIS_REVERSED) return self._constructor(new_data).__finalize__(self) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 42d7163e7f741..196065703aa74 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -632,7 +632,8 @@ def _is_empty_indexer(indexer): return [self] - def putmask(self, mask, new, align=True, inplace=False): + def putmask(self, mask, new, align=True, inplace=False, + axis=0, transpose=False): """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -644,37 +645,55 @@ def putmask(self, mask, new, align=True, inplace=False): new : a ndarray/object align : boolean, perform alignment on other/cond, default is True inplace : perform inplace modification, default is False + axis : int + transpose : boolean + Set to True if self is stored with axes reversed Returns ------- - a new block(s), the result of the putmask + a list of new blocks, the result of the putmask """ new_values = self.values if inplace else self.values.copy() - # may need to align the new if hasattr(new, 'reindex_axis'): - new = new.values.T + new = new.values - # may need to align the mask if hasattr(mask, 'reindex_axis'): - mask = mask.values.T + mask = mask.values # if we are passed a scalar None, convert it here if not is_list_like(new) and isnull(new) and not self.is_object: new = self.fill_value if self._can_hold_element(new): + if transpose: + new_values = new_values.T + new = self._try_cast(new) - # pseudo-broadcast - if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1: - new = np.repeat(new, self.shape[-1]).reshape(self.shape) + # If the default repeat behavior in np.putmask would go in the wrong + # direction, then explictly repeat and reshape new instead + if getattr(new, 'ndim', 0) >= 1: + if self.ndim - 1 == new.ndim and axis == 1: + new = np.repeat(new, new_values.shape[-1]).reshape(self.shape) np.putmask(new_values, mask, new) # maybe upcast me elif mask.any(): + if transpose: + mask = mask.T + if isinstance(new, np.ndarray): + new = new.T + axis = new_values.ndim - axis - 1 + + # Pseudo-broadcast + if getattr(new, 'ndim', 0) >= 1: + if self.ndim - 1 == new.ndim: + new_shape = list(new.shape) + new_shape.insert(axis, 1) + new = new.reshape(tuple(new_shape)) # need to go column by column new_blocks = [] @@ -685,14 +704,15 @@ def putmask(self, mask, new, align=True, inplace=False): # need a new block if m.any(): - - n = new[i] if isinstance( - new, np.ndarray) else np.array(new) + if isinstance(new, np.ndarray): + n = np.squeeze(new[i % new.shape[0]]) + else: + n = np.array(new) # type of the new block dtype, _ = com._maybe_promote(n.dtype) - # we need to exiplicty astype here to make a copy + # we need to explicitly astype here to make a copy n = n.astype(dtype) nv = _putmask_smart(v, m, n) @@ -718,8 +738,10 @@ def putmask(self, mask, new, align=True, inplace=False): if inplace: return [self] - return [make_block(new_values, - placement=self.mgr_locs, fastpath=True)] + if transpose: + new_values = new_values.T + + return [make_block(new_values, placement=self.mgr_locs, fastpath=True)] def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, @@ -1003,7 +1025,7 @@ def handle_error(): fastpath=True, placement=self.mgr_locs)] def where(self, other, cond, align=True, raise_on_error=True, - try_cast=False): + try_cast=False, axis=0, transpose=False): """ evaluate the block; return result block(s) from the result @@ -1014,6 +1036,9 @@ def where(self, other, cond, align=True, raise_on_error=True, align : boolean, perform alignment on other/cond raise_on_error : if True, raise when I can't perform the function, False by default (and just return the data that we had coming in) + axis : int + transpose : boolean + Set to True if self is stored with axes reversed Returns ------- @@ -1021,43 +1046,23 @@ def where(self, other, cond, align=True, raise_on_error=True, """ values = self.values + if transpose: + values = values.T - # see if we can align other if hasattr(other, 'reindex_axis'): other = other.values - # make sure that we can broadcast - is_transposed = False - if hasattr(other, 'ndim') and hasattr(values, 'ndim'): - if values.ndim != other.ndim or values.shape == other.shape[::-1]: - - # if its symmetric are ok, no reshaping needed (GH 7506) - if (values.shape[0] == np.array(values.shape)).all(): - pass - - # pseodo broadcast (its a 2d vs 1d say and where needs it in a - # specific direction) - elif (other.ndim >= 1 and values.ndim - 1 == other.ndim and - values.shape[0] != other.shape[0]): - other = _block_shape(other).T - else: - values = values.T - is_transposed = True - - # see if we can align cond - if not hasattr(cond, 'shape'): - raise ValueError( - "where must have a condition that is ndarray like") - if hasattr(cond, 'reindex_axis'): cond = cond.values - # may need to undo transpose of values - if hasattr(values, 'ndim'): - if values.ndim != cond.ndim or values.shape == cond.shape[::-1]: + # If the default broadcasting would go in the wrong direction, then + # explictly reshape other instead + if getattr(other, 'ndim', 0) >= 1: + if values.ndim - 1 == other.ndim and axis == 1: + other = other.reshape(tuple(other.shape + (1,))) - values = values.T - is_transposed = not is_transposed + if not hasattr(cond, 'shape'): + raise ValueError("where must have a condition that is ndarray like") other = _maybe_convert_string_to_object(other) @@ -1090,15 +1095,14 @@ def func(c, v, o): raise TypeError('Could not compare [%s] with block values' % repr(other)) - if is_transposed: + if transpose: result = result.T # try to cast if requested if try_cast: result = self._try_cast_result(result) - return make_block(result, - ndim=self.ndim, placement=self.mgr_locs) + return make_block(result, ndim=self.ndim, placement=self.mgr_locs) # might need to separate out blocks axis = cond.ndim - 1 @@ -1733,7 +1737,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def putmask(self, mask, new, align=True, inplace=False): + def putmask(self, mask, new, align=True, inplace=False, + axis=0, transpose=False): """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -2425,12 +2430,18 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs): else: kwargs['filter'] = filter_locs - if f == 'where' and kwargs.get('align', True): + if f == 'where': align_copy = True - align_keys = ['other', 'cond'] - elif f == 'putmask' and kwargs.get('align', True): + if kwargs.get('align', True): + align_keys = ['other', 'cond'] + else: + align_keys = ['cond'] + elif f == 'putmask': align_copy = False - align_keys = ['new', 'mask'] + if kwargs.get('align', True): + align_keys = ['new', 'mask'] + else: + align_keys = ['mask'] elif f == 'eval': align_copy = False align_keys = ['other'] diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cfa0ed1a11772..6e82f20f325cb 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10046,6 +10046,110 @@ def test_where_complex(self): df[df.abs() >= 5] = np.nan assert_frame_equal(df,expected) + def test_where_axis(self): + # GH 9736 + df = DataFrame(np.random.randn(2, 2)) + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, 1]) + + expected = DataFrame([[0, 0], [1, 1]], dtype='float64') + result = df.where(mask, s, axis='index') + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis='index', inplace=True) + assert_frame_equal(result, expected) + + expected = DataFrame([[0, 1], [0, 1]], dtype='float64') + result = df.where(mask, s, axis='columns') + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis='columns', inplace=True) + assert_frame_equal(result, expected) + + # Upcast needed + df = DataFrame([[1, 2], [3, 4]], dtype='int64') + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, np.nan]) + + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype='float64') + result = df.where(mask, s, axis='index') + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis='index', inplace=True) + assert_frame_equal(result, expected) + + expected = DataFrame([[0, np.nan], [0, np.nan]], dtype='float64') + result = df.where(mask, s, axis='columns') + assert_frame_equal(result, expected) + + expected = DataFrame({0 : np.array([0, 0], dtype='int64'), + 1 : np.array([np.nan, np.nan], dtype='float64')}) + result = df.copy() + result.where(mask, s, axis='columns', inplace=True) + assert_frame_equal(result, expected) + + # Multiple dtypes (=> multiple Blocks) + df = pd.concat([DataFrame(np.random.randn(10, 2)), + DataFrame(np.random.randint(0, 10, size=(10, 2)))], + ignore_index=True, axis=1) + mask = DataFrame(False, columns=df.columns, index=df.index) + s1 = Series(1, index=df.columns) + s2 = Series(2, index=df.index) + + result = df.where(mask, s1, axis='columns') + expected = DataFrame(1.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype(int) + expected[3] = expected[3].astype(int) + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s1, axis='columns', inplace=True) + assert_frame_equal(result, expected) + + result = df.where(mask, s2, axis='index') + expected = DataFrame(2.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype(int) + expected[3] = expected[3].astype(int) + assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s2, axis='index', inplace=True) + assert_frame_equal(result, expected) + + # DataFrame vs DataFrame + d1 = df.copy().drop(1, axis=0) + expected = df.copy() + expected.loc[1, :] = np.nan + + result = df.where(mask, d1) + assert_frame_equal(result, expected) + result = df.where(mask, d1, axis='index') + assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True) + assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True, axis='index') + assert_frame_equal(result, expected) + + d2 = df.copy().drop(1, axis=1) + expected = df.copy() + expected.loc[:, 1] = np.nan + + result = df.where(mask, d2) + assert_frame_equal(result, expected) + result = df.where(mask, d2, axis='columns') + assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True) + assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True, axis='columns') + assert_frame_equal(result, expected) + def test_mask(self): df = DataFrame(np.random.randn(5, 3)) cond = df > 0