Skip to content

Commit b5dd6a3

Browse files
authored
PERF: remove use of Panel & perf in rolling corr/cov (#19257)
* PERF: remove use of Panel & perf in rolling corr/cov (closes #17917)
1 parent 4eb0cec commit b5dd6a3

File tree

5 files changed

+69
-35
lines changed

5 files changed

+69
-35
lines changed

asv_bench/benchmarks/rolling.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ class Methods(object):
1111
[10, 1000],
1212
['int', 'float'],
1313
['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
14-
'sum', 'corr', 'cov'])
15-
param_names = ['constructor', 'window', 'dtype', 'method']
14+
'sum'])
15+
param_names = ['contructor', 'window', 'dtype', 'method']
1616

1717
def setup(self, constructor, window, dtype, method):
1818
N = 10**5
@@ -23,6 +23,27 @@ def time_rolling(self, constructor, window, dtype, method):
2323
getattr(self.roll, method)()
2424

2525

26+
class Pairwise(object):
27+
28+
sample_time = 0.2
29+
params = ([10, 1000, None],
30+
['corr', 'cov'],
31+
[True, False])
32+
param_names = ['window', 'method', 'pairwise']
33+
34+
def setup(self, window, method, pairwise):
35+
N = 10**4
36+
arr = np.random.random(N)
37+
self.df = pd.DataFrame(arr)
38+
39+
def time_pairwise(self, window, method, pairwise):
40+
if window is None:
41+
r = self.df.expanding()
42+
else:
43+
r = self.df.rolling(window=window)
44+
getattr(r, method)(self.df, pairwise=pairwise)
45+
46+
2647
class Quantile(object):
2748

2849
sample_time = 0.2

doc/source/whatsnew/v0.23.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,7 @@ Performance Improvements
383383
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
384384
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
385385
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
386-
386+
- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
387387

388388
.. _whatsnew_0230.docs:
389389

pandas/core/reshape/pivot.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -99,19 +99,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
9999

100100
if not dropna:
101101
from pandas import MultiIndex
102-
try:
102+
if table.index.nlevels > 1:
103103
m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
104104
names=table.index.names)
105105
table = table.reindex(m, axis=0)
106-
except AttributeError:
107-
pass # it's a single level
108106

109-
try:
107+
if table.columns.nlevels > 1:
110108
m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
111109
names=table.columns.names)
112110
table = table.reindex(m, axis=1)
113-
except AttributeError:
114-
pass # it's a single level or a series
115111

116112
if isinstance(table, ABCDataFrame):
117113
table = table.sort_index(axis=1)

pandas/core/window.py

+30-17
Original file line numberDiff line numberDiff line change
@@ -1863,25 +1863,38 @@ def dataframe_from_int_dict(data, frame_template):
18631863
results[i][j] = f(*_prep_binary(arg1.iloc[:, i],
18641864
arg2.iloc[:, j]))
18651865

1866-
# TODO: not the most efficient (perf-wise)
1867-
# though not bad code-wise
1868-
from pandas import Panel, MultiIndex, concat
1869-
1870-
with warnings.catch_warnings(record=True):
1871-
p = Panel.from_dict(results).swapaxes('items', 'major')
1872-
if len(p.major_axis) > 0:
1873-
p.major_axis = arg1.columns[p.major_axis]
1874-
if len(p.minor_axis) > 0:
1875-
p.minor_axis = arg2.columns[p.minor_axis]
1876-
1877-
if len(p.items):
1866+
from pandas import MultiIndex, concat
1867+
1868+
result_index = arg1.index.union(arg2.index)
1869+
if len(result_index):
1870+
1871+
# construct result frame
18781872
result = concat(
1879-
[p.iloc[i].T for i in range(len(p.items))],
1880-
keys=p.items)
1873+
[concat([results[i][j]
1874+
for j, c in enumerate(arg2.columns)],
1875+
ignore_index=True)
1876+
for i, c in enumerate(arg1.columns)],
1877+
ignore_index=True,
1878+
axis=1)
1879+
result.columns = arg1.columns
1880+
1881+
# set the index and reorder
1882+
if arg2.columns.nlevels > 1:
1883+
result.index = MultiIndex.from_product(
1884+
arg2.columns.levels + [result_index])
1885+
result = result.reorder_levels([2, 0, 1]).sort_index()
1886+
else:
1887+
result.index = MultiIndex.from_product(
1888+
[range(len(arg2.columns)),
1889+
range(len(result_index))])
1890+
result = result.swaplevel(1, 0).sort_index()
1891+
result.index = MultiIndex.from_product(
1892+
[result_index] + [arg2.columns])
18811893
else:
18821894

1895+
# empty result
18831896
result = DataFrame(
1884-
index=MultiIndex(levels=[arg1.index, arg1.columns],
1897+
index=MultiIndex(levels=[arg1.index, arg2.columns],
18851898
labels=[[], []]),
18861899
columns=arg2.columns,
18871900
dtype='float64')
@@ -1890,9 +1903,9 @@ def dataframe_from_int_dict(data, frame_template):
18901903
# reset our column names to arg2 names
18911904
# careful not to mutate the original names
18921905
result.columns = result.columns.set_names(
1893-
arg2.columns.names)
1906+
arg1.columns.names)
18941907
result.index = result.index.set_names(
1895-
arg1.index.names + arg1.columns.names)
1908+
result_index.names + arg2.columns.names)
18961909

18971910
return result
18981911

pandas/tests/test_window.py

+13-9
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import pandas.tseries.offsets as offsets
1515
from pandas.core.base import SpecificationError
1616
from pandas.errors import UnsupportedFunctionCall
17+
from pandas.core.sorting import safe_sort
1718
import pandas.util.testing as tm
1819
import pandas.util._test_decorators as td
1920
from pandas.compat import range, zip
@@ -1645,7 +1646,7 @@ def compare(self, result, expected):
16451646
result = result.dropna().values
16461647
expected = expected.dropna().values
16471648

1648-
tm.assert_numpy_array_equal(result, expected)
1649+
tm.assert_numpy_array_equal(result, expected, check_dtype=False)
16491650

16501651
@pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()])
16511652
def test_no_flex(self, f):
@@ -1670,15 +1671,19 @@ def test_no_flex(self, f):
16701671
def test_pairwise_with_self(self, f):
16711672

16721673
# DataFrame with itself, pairwise=True
1673-
results = [f(df) for df in self.df1s]
1674-
for (df, result) in zip(self.df1s, results):
1674+
# note that we may construct the 1st level of the MI
1675+
# in a non-monotonic way, so compare accordingly
1676+
results = []
1677+
for i, df in enumerate(self.df1s):
1678+
result = f(df)
16751679
tm.assert_index_equal(result.index.levels[0],
16761680
df.index,
16771681
check_names=False)
1678-
tm.assert_index_equal(result.index.levels[1],
1679-
df.columns,
1680-
check_names=False)
1682+
tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
1683+
safe_sort(df.columns.unique()))
16811684
tm.assert_index_equal(result.columns, df.columns)
1685+
results.append(df)
1686+
16821687
for i, result in enumerate(results):
16831688
if i > 0:
16841689
self.compare(result, results[0])
@@ -1716,9 +1721,8 @@ def test_pairwise_with_other(self, f):
17161721
tm.assert_index_equal(result.index.levels[0],
17171722
df.index,
17181723
check_names=False)
1719-
tm.assert_index_equal(result.index.levels[1],
1720-
self.df2.columns,
1721-
check_names=False)
1724+
tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
1725+
safe_sort(self.df2.columns.unique()))
17221726
for i, result in enumerate(results):
17231727
if i > 0:
17241728
self.compare(result, results[0])

0 commit comments

Comments
 (0)