ENH: Exclude nuisance columns from result of window functions #27044

Merged
11 commits merged on Jul 1, 2019
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -769,6 +769,7 @@ Groupby/resample/rolling
- Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`)
- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`)
- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`)
- Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors, and to raise a ``DataError`` only if all columns are nuisance (:issue:`12537`)

Reshaping
^^^^^^^^^
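A minimal sketch of the behaviour the new whatsnew entry above describes (the frame mirrors the test added to pandas/tests/test_window.py further down):

import pandas as pd

# 'C' is an object ("nuisance") column that a rolling sum cannot handle.
df = pd.DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'})

# Before this change the whole call raised; now the object column is simply
# excluded and only the numeric columns appear in the result.
result = df.rolling(window=3).sum()
print(list(result.columns))  # ['A', 'B']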
84 changes: 63 additions & 21 deletions pandas/core/window.py
Expand Up @@ -22,7 +22,7 @@
ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCPeriodIndex, ABCSeries,
ABCTimedeltaIndex)

from pandas.core.base import PandasObject, SelectionMixin
from pandas.core.base import DataError, PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.generic import _shared_docs
from pandas.core.groupby.base import GroupByMixin
@@ -112,9 +112,9 @@ def _create_blocks(self):
if obj.ndim == 2:
obj = obj.reindex(columns=obj.columns.difference([self.on]),
copy=False)
blocks = obj._to_dict_of_blocks(copy=False).values()
blocks_dict = obj._to_dict_of_blocks(copy=False)

return blocks, obj, index
return blocks_dict, obj, index

def _gotitem(self, key, ndim, subset=None):
"""
@@ -243,7 +243,7 @@ def _wrap_result(self, result, block=None, obj=None):
return type(obj)(result, index=index, columns=block.columns)
return result

def _wrap_results(self, results, blocks, obj):
def _wrap_results(self, results, blocks, obj, exclude=None):
"""
Wrap the results.

@@ -252,6 +252,7 @@ def _wrap_results(self, results, blocks, obj):
results : list of ndarrays
blocks : list of blocks
obj : conformed data (may be resampled)
exclude : list of columns to exclude, default None
"""

from pandas import Series, concat
@@ -285,6 +286,13 @@ def _wrap_results(self, results, blocks, obj):
indexer = columns.get_indexer(selection.tolist() + [name])
columns = columns.take(sorted(indexer))

# exclude nuisance columns so that they are not reindexed
if exclude is not None and exclude:
columns = [c for c in columns if c not in exclude]

if not columns:
raise DataError('No numeric types to aggregate')

if not len(final):
return obj.astype('float64')
return concat(final, axis=1).reindex(columns=columns, copy=False)
@@ -671,14 +679,24 @@ def _apply_window(self, mean=True, **kwargs):
window = self._prep_window(**kwargs)
center = self.center

blocks, obj, index = self._create_blocks()
blocks_dict, obj, index = self._create_blocks()
dtypes = blocks_dict.keys()
blocks = blocks_dict.values()

results = []
for b in blocks:
exclude = []
for dtype in list(dtypes):
Contributor:

best just to

for b in blocks_dict.values():
    .....

then don't need anything else

Contributor Author:

Since my solution to the unordered-dict issue requires deleting nuisance blocks (a block holds the columns of one dtype), I needed a shallow copy of the keys so that I can remove them while iterating.
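For context, a standalone illustration of that pattern with made-up dict contents: iterating over a snapshot of the keys makes it safe to delete entries inside the loop.

blocks_by_dtype = {'int64': 'numeric block', 'object': 'nuisance block'}

for dtype in list(blocks_by_dtype):      # shallow copy of the keys
    if dtype == 'object':
        del blocks_by_dtype[dtype]       # safe, because we iterate over the copy

print(blocks_by_dtype)                   # {'int64': 'numeric block'}

# Iterating blocks_by_dtype directly and deleting inside the loop would raise
# "RuntimeError: dictionary changed size during iteration".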

b = blocks_dict[dtype]
try:
values = self._prep_values(b.values)
except TypeError:
results.append(b.values.copy())
continue

except (TypeError, NotImplementedError):
if isinstance(obj, ABCDataFrame):
exclude.extend(b.columns)
del blocks_dict[dtype]
Contributor:

why are you del here?

Contributor Author:

As you stated, the dictionary order differs on each run, so the iteration order of the blocks differs as well.

So for a DataFrame with columns ["A", "B"] of types [int, str], when iteration starts with "B":

results = [values_of_A] but blocks = [block_for_B, block_for_A]. There is a mismatch (values_of_A and block_for_B end up paired), so I went with removing block_for_B, which was the first thing that came to mind.
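A runnable restatement of that mismatch, using the names from the reply above; when the results are later paired element-wise with the blocks during wrapping, the surviving values line up with the wrong block unless the nuisance block is removed.

results = ['values_of_A']                 # only the numeric block produced a result
blocks = ['block_for_B', 'block_for_A']   # nuisance block still present
print(list(zip(results, blocks)))         # [('values_of_A', 'block_for_B')]  <- mismatch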

continue
else:
raise DataError('No numeric types to aggregate')

if values.size == 0:
results.append(values.copy())
@@ -700,7 +718,7 @@ def f(arg, *args, **kwargs):
result = self._center_window(result, window)
results.append(result)

return self._wrap_results(results, blocks, obj)
return self._wrap_results(results, blocks, obj, exclude)

_agg_see_also_doc = dedent("""
See Also
@@ -842,11 +860,25 @@ def _apply(self, func, name=None, window=None, center=None,
if check_minp is None:
check_minp = _use_window

blocks, obj, index = self._create_blocks()
blocks_dict, obj, index = self._create_blocks()
dtypes = blocks_dict.keys()
blocks = blocks_dict.values()
index, indexi = self._get_index(index=index)

results = []
for b in blocks:
values = self._prep_values(b.values)
exclude = []
for dtype in list(dtypes):
b = blocks_dict[dtype]
try:
values = self._prep_values(b.values)

except (TypeError, NotImplementedError):
if isinstance(obj, ABCDataFrame):
exclude.extend(b.columns)
del blocks_dict[dtype]
continue
else:
raise DataError('No numeric types to aggregate')

if values.size == 0:
results.append(values.copy())
@@ -892,7 +924,7 @@ def calc(x):

results.append(result)

return self._wrap_results(results, blocks, obj)
return self._wrap_results(results, blocks, obj, exclude)


class _Rolling_and_Expanding(_Rolling):
@@ -937,7 +969,8 @@ class _Rolling_and_Expanding(_Rolling):

def count(self):

blocks, obj, index = self._create_blocks()
blocks_dict, obj, index = self._create_blocks()
blocks = blocks_dict.values()
# Validate the index
self._get_index(index=index)

@@ -2290,14 +2323,23 @@ def _apply(self, func, **kwargs):
-------
y : same type as input argument
"""
blocks, obj, index = self._create_blocks()
blocks_dict, obj, index = self._create_blocks()
dtypes = blocks_dict.keys()
blocks = blocks_dict.values()

results = []
for b in blocks:
exclude = []
for dtype in list(dtypes):
b = blocks_dict[dtype]
try:
values = self._prep_values(b.values)
except TypeError:
results.append(b.values.copy())
continue
except (TypeError, NotImplementedError):
if isinstance(obj, ABCDataFrame):
exclude.extend(b.columns)
del blocks_dict[dtype]
continue
else:
raise DataError('No numeric types to aggregate')

if values.size == 0:
results.append(values.copy())
@@ -2316,7 +2358,7 @@ def func(arg):

results.append(np.apply_along_axis(func, self.axis, values))

return self._wrap_results(results, blocks, obj)
return self._wrap_results(results, blocks, obj, exclude)

@Substitution(name='ewm')
@Appender(_doc_template)
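To summarise the pattern the window.py changes above implement, here is a self-contained sketch. It works column by column rather than block by block and is not the actual pandas code, but it shows the same idea: collect the columns that cannot be aggregated, drop them from the result, and fail only when nothing numeric remains.

import pandas as pd
from pandas.api.types import is_numeric_dtype


def rolling_sum_numeric_only(df, window):
    results = {}
    exclude = []                           # nuisance columns, left out of the result
    for col in df.columns:
        if not is_numeric_dtype(df[col]):
            exclude.append(col)
            continue
        results[col] = df[col].rolling(window).sum()
    if not results:
        # pandas raises DataError('No numeric types to aggregate') here
        raise TypeError('No numeric types to aggregate')
    return pd.DataFrame(results)


df = pd.DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'})
print(list(rolling_sum_numeric_only(df, 3).columns))  # ['A', 'B']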
15 changes: 7 additions & 8 deletions pandas/tests/test_window.py
@@ -14,7 +14,7 @@
import pandas as pd
from pandas import (
DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna)
from pandas.core.base import SpecificationError
from pandas.core.base import DataError, SpecificationError
from pandas.core.sorting import safe_sort
import pandas.core.window as rwindow
import pandas.util.testing as tm
@@ -118,9 +118,11 @@ def tests_skip_nuisance(self):
def test_skip_sum_object_raises(self):
df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'})
r = df.rolling(window=3)

with pytest.raises(TypeError, match='cannot handle this type'):
r.sum()
result = r.sum()
expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9],
'B': [np.nan, np.nan, 18, 21, 24]},
columns=list('AB'))
tm.assert_frame_equal(result, expected)

def test_agg(self):
df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
@@ -1069,15 +1071,12 @@ class DatetimeLike(Dtype):
def check_dtypes(self, f, f_name, d, d_name, exp):

roll = d.rolling(window=self.window)

if f_name == 'count':
result = f(roll)
tm.assert_almost_equal(result, exp)

else:

# other methods not Implemented ATM
with pytest.raises(NotImplementedError):
with pytest.raises(DataError):
f(roll)


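And the complementary case covered by the updated check_dtypes expectation above: when every column is a nuisance column, the aggregation raises DataError instead of NotImplementedError. A minimal sketch, assuming the pandas version of this PR, where DataError is importable from pandas.core.base:

import pandas as pd
import pytest
from pandas.core.base import DataError

df = pd.DataFrame({'C': list('abcde')})   # only non-numeric columns

with pytest.raises(DataError):
    df.rolling(window=3).sum()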