Skip to content

BUG: DataFrame.where does not respect axis parameter when shape is symmetric #9838

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ Bug Fixes

- Bug in ``ExcelReader`` when worksheet is empty (:issue:`6403`)

- Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`)

- Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`)

Expand Down
20 changes: 16 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3634,19 +3634,31 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
else:
other = self._constructor(other, **self._construct_axes_dict())

if axis is None:
axis = 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isnt' axis=0 as the default, then align = axis == 0 simpler?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was trying to keep the original behavior. Your suggestion might cause an extra re-indexing if axis is set explictly to 0, but I can't find any situation where it gives a different result.

EDIT: Actually, it looks like both of these options are just wrong, and break in some cases where the DataFrame has multiple blocks. I'll have to take a closer look.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you added this note a while back. can you confirm what is going on?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the comment below:

"The original behavior was to align cond and other with each Block along the 'columns' axis only when no axis parameter was passed to where (which can only happen in the case that other is also a DataFrame), but that doesn't work in some cases. The correct behavior seems to be: always align cond (because each Block only contains a subset of the columns), and align other whenever it has the same ndim as self, or if the alignment axis is 'columns': basically, every case except DataFrame.where(cond, Series, axis='index')

This is all fixed in the most recent commit."


if self.ndim == getattr(other, 'ndim', 0):
align = True
else:
align = (self._get_axis_number(axis) == 1)

block_axis = self._get_block_manager_axis(axis)

if inplace:
# we may have different type blocks come out of putmask, so
# reconstruct the block manager

self._check_inplace_setting(other)
new_data = self._data.putmask(mask=cond, new=other, align=axis is None,
inplace=True)
new_data = self._data.putmask(mask=cond, new=other, align=align,
inplace=True, axis=block_axis,
transpose=self._AXIS_REVERSED)
self._update_inplace(new_data)

else:
new_data = self._data.where(other=other, cond=cond, align=axis is None,
new_data = self._data.where(other=other, cond=cond, align=align,
raise_on_error=raise_on_error,
try_cast=try_cast)
try_cast=try_cast, axis=block_axis,
transpose=self._AXIS_REVERSED)

return self._constructor(new_data).__finalize__(self)

Expand Down
117 changes: 64 additions & 53 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,8 @@ def _is_empty_indexer(indexer):

return [self]

def putmask(self, mask, new, align=True, inplace=False):
def putmask(self, mask, new, align=True, inplace=False,
axis=0, transpose=False):
""" putmask the data to the block; it is possible that we may create a
new dtype of block

Expand All @@ -644,37 +645,55 @@ def putmask(self, mask, new, align=True, inplace=False):
new : a ndarray/object
align : boolean, perform alignment on other/cond, default is True
inplace : perform inplace modification, default is False
axis : int
transpose : boolean
Set to True if self is stored with axes reversed

Returns
-------
a new block(s), the result of the putmask
a list of new blocks, the result of the putmask
"""

new_values = self.values if inplace else self.values.copy()

# may need to align the new
if hasattr(new, 'reindex_axis'):
new = new.values.T
new = new.values

# may need to align the mask
if hasattr(mask, 'reindex_axis'):
mask = mask.values.T
mask = mask.values

# if we are passed a scalar None, convert it here
if not is_list_like(new) and isnull(new) and not self.is_object:
new = self.fill_value

if self._can_hold_element(new):
if transpose:
new_values = new_values.T

new = self._try_cast(new)

# pseudo-broadcast
if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1:
new = np.repeat(new, self.shape[-1]).reshape(self.shape)
# If the default repeat behavior in np.putmask would go in the wrong
# direction, then explictly repeat and reshape new instead
if getattr(new, 'ndim', 0) >= 1:
if self.ndim - 1 == new.ndim and axis == 1:
new = np.repeat(new, new_values.shape[-1]).reshape(self.shape)

np.putmask(new_values, mask, new)

# maybe upcast me
elif mask.any():
if transpose:
mask = mask.T
if isinstance(new, np.ndarray):
new = new.T
axis = new_values.ndim - axis - 1

# Pseudo-broadcast
if getattr(new, 'ndim', 0) >= 1:
if self.ndim - 1 == new.ndim:
new_shape = list(new.shape)
new_shape.insert(axis, 1)
new = new.reshape(tuple(new_shape))

# need to go column by column
new_blocks = []
Expand All @@ -685,14 +704,15 @@ def putmask(self, mask, new, align=True, inplace=False):

# need a new block
if m.any():

n = new[i] if isinstance(
new, np.ndarray) else np.array(new)
if isinstance(new, np.ndarray):
n = np.squeeze(new[i % new.shape[0]])
else:
n = np.array(new)

# type of the new block
dtype, _ = com._maybe_promote(n.dtype)

# we need to exiplicty astype here to make a copy
# we need to explicitly astype here to make a copy
n = n.astype(dtype)

nv = _putmask_smart(v, m, n)
Expand All @@ -718,8 +738,10 @@ def putmask(self, mask, new, align=True, inplace=False):
if inplace:
return [self]

return [make_block(new_values,
placement=self.mgr_locs, fastpath=True)]
if transpose:
new_values = new_values.T

return [make_block(new_values, placement=self.mgr_locs, fastpath=True)]

def interpolate(self, method='pad', axis=0, index=None,
values=None, inplace=False, limit=None,
Expand Down Expand Up @@ -1003,7 +1025,7 @@ def handle_error():
fastpath=True, placement=self.mgr_locs)]

def where(self, other, cond, align=True, raise_on_error=True,
try_cast=False):
try_cast=False, axis=0, transpose=False):
"""
evaluate the block; return result block(s) from the result

Expand All @@ -1014,50 +1036,33 @@ def where(self, other, cond, align=True, raise_on_error=True,
align : boolean, perform alignment on other/cond
raise_on_error : if True, raise when I can't perform the function,
False by default (and just return the data that we had coming in)
axis : int
transpose : boolean
Set to True if self is stored with axes reversed

Returns
-------
a new block(s), the result of the func
"""

values = self.values
if transpose:
values = values.T

# see if we can align other
if hasattr(other, 'reindex_axis'):
other = other.values

# make sure that we can broadcast
is_transposed = False
if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
if values.ndim != other.ndim or values.shape == other.shape[::-1]:

# if its symmetric are ok, no reshaping needed (GH 7506)
if (values.shape[0] == np.array(values.shape)).all():
pass

# pseodo broadcast (its a 2d vs 1d say and where needs it in a
# specific direction)
elif (other.ndim >= 1 and values.ndim - 1 == other.ndim and
values.shape[0] != other.shape[0]):
other = _block_shape(other).T
else:
values = values.T
is_transposed = True

# see if we can align cond
if not hasattr(cond, 'shape'):
raise ValueError(
"where must have a condition that is ndarray like")

if hasattr(cond, 'reindex_axis'):
cond = cond.values

# may need to undo transpose of values
if hasattr(values, 'ndim'):
if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
# If the default broadcasting would go in the wrong direction, then
# explictly reshape other instead
if getattr(other, 'ndim', 0) >= 1:
if values.ndim - 1 == other.ndim and axis == 1:
other = other.reshape(tuple(other.shape + (1,)))

values = values.T
is_transposed = not is_transposed
if not hasattr(cond, 'shape'):
raise ValueError("where must have a condition that is ndarray like")

other = _maybe_convert_string_to_object(other)

Expand Down Expand Up @@ -1090,15 +1095,14 @@ def func(c, v, o):
raise TypeError('Could not compare [%s] with block values'
% repr(other))

if is_transposed:
if transpose:
result = result.T

# try to cast if requested
if try_cast:
result = self._try_cast_result(result)

return make_block(result,
ndim=self.ndim, placement=self.mgr_locs)
return make_block(result, ndim=self.ndim, placement=self.mgr_locs)

# might need to separate out blocks
axis = cond.ndim - 1
Expand Down Expand Up @@ -1733,7 +1737,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):

return self.make_block_same_class(new_values, new_mgr_locs)

def putmask(self, mask, new, align=True, inplace=False):
def putmask(self, mask, new, align=True, inplace=False,
axis=0, transpose=False):
""" putmask the data to the block; it is possible that we may create a
new dtype of block

Expand Down Expand Up @@ -2425,12 +2430,18 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs):
else:
kwargs['filter'] = filter_locs

if f == 'where' and kwargs.get('align', True):
if f == 'where':
align_copy = True
align_keys = ['other', 'cond']
elif f == 'putmask' and kwargs.get('align', True):
if kwargs.get('align', True):
align_keys = ['other', 'cond']
else:
align_keys = ['cond']
elif f == 'putmask':
align_copy = False
align_keys = ['new', 'mask']
if kwargs.get('align', True):
align_keys = ['new', 'mask']
else:
align_keys = ['mask']
elif f == 'eval':
align_copy = False
align_keys = ['other']
Expand Down
104 changes: 104 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10046,6 +10046,110 @@ def test_where_complex(self):
df[df.abs() >= 5] = np.nan
assert_frame_equal(df,expected)

def test_where_axis(self):
# GH 9736
df = DataFrame(np.random.randn(2, 2))
mask = DataFrame([[False, False], [False, False]])
s = Series([0, 1])

expected = DataFrame([[0, 0], [1, 1]], dtype='float64')
result = df.where(mask, s, axis='index')
assert_frame_equal(result, expected)

result = df.copy()
result.where(mask, s, axis='index', inplace=True)
assert_frame_equal(result, expected)

expected = DataFrame([[0, 1], [0, 1]], dtype='float64')
result = df.where(mask, s, axis='columns')
assert_frame_equal(result, expected)

result = df.copy()
result.where(mask, s, axis='columns', inplace=True)
assert_frame_equal(result, expected)

# Upcast needed
df = DataFrame([[1, 2], [3, 4]], dtype='int64')
mask = DataFrame([[False, False], [False, False]])
s = Series([0, np.nan])

expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype='float64')
result = df.where(mask, s, axis='index')
assert_frame_equal(result, expected)

result = df.copy()
result.where(mask, s, axis='index', inplace=True)
assert_frame_equal(result, expected)

expected = DataFrame([[0, np.nan], [0, np.nan]], dtype='float64')
result = df.where(mask, s, axis='columns')
assert_frame_equal(result, expected)

expected = DataFrame({0 : np.array([0, 0], dtype='int64'),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test that has several dtypes, (we may have somewhere else), but for completeness. e.g. something like:

pd.concat([DataFrame(np.random.randn(10,2)),DataFrame(np.random.randint(0,10,size=20).reshape(10,2))],ignore_index=True, axis=1)

1 : np.array([np.nan, np.nan], dtype='float64')})
result = df.copy()
result.where(mask, s, axis='columns', inplace=True)
assert_frame_equal(result, expected)

# Multiple dtypes (=> multiple Blocks)
df = pd.concat([DataFrame(np.random.randn(10, 2)),
DataFrame(np.random.randint(0, 10, size=(10, 2)))],
ignore_index=True, axis=1)
mask = DataFrame(False, columns=df.columns, index=df.index)
s1 = Series(1, index=df.columns)
s2 = Series(2, index=df.index)

result = df.where(mask, s1, axis='columns')
expected = DataFrame(1.0, columns=df.columns, index=df.index)
expected[2] = expected[2].astype(int)
expected[3] = expected[3].astype(int)
assert_frame_equal(result, expected)

result = df.copy()
result.where(mask, s1, axis='columns', inplace=True)
assert_frame_equal(result, expected)

result = df.where(mask, s2, axis='index')
expected = DataFrame(2.0, columns=df.columns, index=df.index)
expected[2] = expected[2].astype(int)
expected[3] = expected[3].astype(int)
assert_frame_equal(result, expected)

result = df.copy()
result.where(mask, s2, axis='index', inplace=True)
assert_frame_equal(result, expected)

# DataFrame vs DataFrame
d1 = df.copy().drop(1, axis=0)
expected = df.copy()
expected.loc[1, :] = np.nan

result = df.where(mask, d1)
assert_frame_equal(result, expected)
result = df.where(mask, d1, axis='index')
assert_frame_equal(result, expected)
result = df.copy()
result.where(mask, d1, inplace=True)
assert_frame_equal(result, expected)
result = df.copy()
result.where(mask, d1, inplace=True, axis='index')
assert_frame_equal(result, expected)

d2 = df.copy().drop(1, axis=1)
expected = df.copy()
expected.loc[:, 1] = np.nan

result = df.where(mask, d2)
assert_frame_equal(result, expected)
result = df.where(mask, d2, axis='columns')
assert_frame_equal(result, expected)
result = df.copy()
result.where(mask, d2, inplace=True)
assert_frame_equal(result, expected)
result = df.copy()
result.where(mask, d2, inplace=True, axis='columns')
assert_frame_equal(result, expected)

def test_mask(self):
df = DataFrame(np.random.randn(5, 3))
cond = df > 0
Expand Down