Skip to content

PERF: remove use of Panel & perf in rolling corr/cov #19257

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ class Methods(object):
[10, 1000],
['int', 'float'],
['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
'sum', 'corr', 'cov'])
param_names = ['constructor', 'window', 'dtype', 'method']
'sum'])
param_names = ['contructor', 'window', 'dtype', 'method']

def setup(self, constructor, window, dtype, method):
N = 10**5
Expand All @@ -23,6 +23,27 @@ def time_rolling(self, constructor, window, dtype, method):
getattr(self.roll, method)()


class Pairwise(object):
    """Benchmark ``corr``/``cov`` of a frame against itself.

    Parameterized over the window size (``None`` meaning an expanding
    window), the method name, and the ``pairwise`` flag.
    """

    sample_time = 0.2
    params = ([10, 1000, None],
              ['corr', 'cov'],
              [True, False])
    param_names = ['window', 'method', 'pairwise']

    def setup(self, window, method, pairwise):
        """Build the single-column frame of random floats to be timed."""
        n_rows = 10**4
        self.df = pd.DataFrame(np.random.random(n_rows))

    def time_pairwise(self, window, method, pairwise):
        """Run ``method`` on the windowed frame, paired with itself.

        A ``window`` of ``None`` selects ``expanding()``; any other value
        is forwarded to ``rolling(window=...)``.
        """
        if window is not None:
            windowed = self.df.rolling(window=window)
        else:
            windowed = self.df.expanding()
        binary_op = getattr(windowed, method)
        binary_op(self.df, pairwise=pairwise)


class Quantile(object):

sample_time = 0.2
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ Performance Improvements
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)

- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)

.. _whatsnew_0230.docs:

Expand Down
8 changes: 2 additions & 6 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,19 +99,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',

if not dropna:
from pandas import MultiIndex
try:
if table.index.nlevels > 1:
m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
names=table.index.names)
table = table.reindex(m, axis=0)
except AttributeError:
pass # it's a single level

try:
if table.columns.nlevels > 1:
m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
names=table.columns.names)
table = table.reindex(m, axis=1)
except AttributeError:
pass # it's a single level or a series

if isinstance(table, ABCDataFrame):
table = table.sort_index(axis=1)
Expand Down
47 changes: 30 additions & 17 deletions pandas/core/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -1863,25 +1863,38 @@ def dataframe_from_int_dict(data, frame_template):
results[i][j] = f(*_prep_binary(arg1.iloc[:, i],
arg2.iloc[:, j]))

# TODO: not the most efficient (perf-wise)
# though not bad code-wise
from pandas import Panel, MultiIndex, concat

with warnings.catch_warnings(record=True):
p = Panel.from_dict(results).swapaxes('items', 'major')
if len(p.major_axis) > 0:
p.major_axis = arg1.columns[p.major_axis]
if len(p.minor_axis) > 0:
p.minor_axis = arg2.columns[p.minor_axis]

if len(p.items):
from pandas import MultiIndex, concat

result_index = arg1.index.union(arg2.index)
if len(result_index):

# construct result frame
result = concat(
[p.iloc[i].T for i in range(len(p.items))],
keys=p.items)
[concat([results[i][j]
for j, c in enumerate(arg2.columns)],
ignore_index=True)
for i, c in enumerate(arg1.columns)],
ignore_index=True,
axis=1)
result.columns = arg1.columns

# set the index and reorder
if arg2.columns.nlevels > 1:
result.index = MultiIndex.from_product(
arg2.columns.levels + [result_index])
result = result.reorder_levels([2, 0, 1]).sort_index()
else:
result.index = MultiIndex.from_product(
[range(len(arg2.columns)),
range(len(result_index))])
result = result.swaplevel(1, 0).sort_index()
result.index = MultiIndex.from_product(
[result_index] + [arg2.columns])
else:

# empty result
result = DataFrame(
index=MultiIndex(levels=[arg1.index, arg1.columns],
index=MultiIndex(levels=[arg1.index, arg2.columns],
labels=[[], []]),
columns=arg2.columns,
dtype='float64')
Expand All @@ -1890,9 +1903,9 @@ def dataframe_from_int_dict(data, frame_template):
# reset our column names to arg2 names
# careful not to mutate the original names
result.columns = result.columns.set_names(
arg2.columns.names)
arg1.columns.names)
result.index = result.index.set_names(
arg1.index.names + arg1.columns.names)
result_index.names + arg2.columns.names)

return result

Expand Down
22 changes: 13 additions & 9 deletions pandas/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pandas.tseries.offsets as offsets
from pandas.core.base import SpecificationError
from pandas.errors import UnsupportedFunctionCall
from pandas.core.sorting import safe_sort
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.compat import range, zip
Expand Down Expand Up @@ -1645,7 +1646,7 @@ def compare(self, result, expected):
result = result.dropna().values
expected = expected.dropna().values

tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result, expected, check_dtype=False)

@pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()])
def test_no_flex(self, f):
Expand All @@ -1670,15 +1671,19 @@ def test_no_flex(self, f):
def test_pairwise_with_self(self, f):

# DataFrame with itself, pairwise=True
results = [f(df) for df in self.df1s]
for (df, result) in zip(self.df1s, results):
# note that we may construct the 1st level of the MI
# in a non-monotonic way, so compare accordingly
results = []
for i, df in enumerate(self.df1s):
result = f(df)
tm.assert_index_equal(result.index.levels[0],
df.index,
check_names=False)
tm.assert_index_equal(result.index.levels[1],
df.columns,
check_names=False)
tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
safe_sort(df.columns.unique()))
tm.assert_index_equal(result.columns, df.columns)
results.append(df)

for i, result in enumerate(results):
if i > 0:
self.compare(result, results[0])
Expand Down Expand Up @@ -1716,9 +1721,8 @@ def test_pairwise_with_other(self, f):
tm.assert_index_equal(result.index.levels[0],
df.index,
check_names=False)
tm.assert_index_equal(result.index.levels[1],
self.df2.columns,
check_names=False)
tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
safe_sort(self.df2.columns.unique()))
for i, result in enumerate(results):
if i > 0:
self.compare(result, results[0])
Expand Down