Skip to content

Commit 99b7c8c

Browse files
committed
Merge pull request #7738 from seth-p/flex_binary_moment_column_order
BUG: _flex_binary_moment() doesn't preserve column order or handle multiple columns with the same label
2 parents 6d4a686 + 34d2910 commit 99b7c8c

File tree

3 files changed

+152
-19
lines changed

3 files changed

+152
-19
lines changed

doc/source/v0.15.0.txt

+5-2
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ Bug Fixes
265265

266266

267267

268-
- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
268+
- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
269269

270270

271271

@@ -278,7 +278,10 @@ Bug Fixes
278278
- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
279279
- Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`)
280280

281-
281+
- Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_corr``, ``ewmcov``, and ``ewmcorr``
282+
returning results with columns sorted by name and producing an error for non-unique columns;
283+
now handles non-unique columns and returns columns in original order
284+
(except for the case of two DataFrames with ``pairwise=False``, where behavior is unchanged) (:issue:`7542`)
282285

283286

284287

pandas/stats/moments.py

+33-16
Original file line numberDiff line numberDiff line change
@@ -259,38 +259,55 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False):
259259
isinstance(arg2, (np.ndarray,Series)):
260260
X, Y = _prep_binary(arg1, arg2)
261261
return f(X, Y)
262+
262263
elif isinstance(arg1, DataFrame):
264+
def dataframe_from_int_dict(data, frame_template):
265+
result = DataFrame(data, index=frame_template.index)
266+
result.columns = frame_template.columns[result.columns]
267+
return result
268+
263269
results = {}
264270
if isinstance(arg2, DataFrame):
265-
X, Y = arg1.align(arg2, join='outer')
266271
if pairwise is False:
267-
X = X + 0 * Y
268-
Y = Y + 0 * X
269-
res_columns = arg1.columns.union(arg2.columns)
270-
for col in res_columns:
271-
if col in X and col in Y:
272-
results[col] = f(X[col], Y[col])
272+
if arg1 is arg2:
273+
# special case in order to handle duplicate column names
274+
for i, col in enumerate(arg1.columns):
275+
results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
276+
return dataframe_from_int_dict(results, arg1)
277+
else:
278+
if not arg1.columns.is_unique:
279+
raise ValueError("'arg1' columns are not unique")
280+
if not arg2.columns.is_unique:
281+
raise ValueError("'arg2' columns are not unique")
282+
X, Y = arg1.align(arg2, join='outer')
283+
X = X + 0 * Y
284+
Y = Y + 0 * X
285+
res_columns = arg1.columns.union(arg2.columns)
286+
for col in res_columns:
287+
if col in X and col in Y:
288+
results[col] = f(X[col], Y[col])
289+
return DataFrame(results, index=X.index, columns=res_columns)
273290
elif pairwise is True:
274291
results = defaultdict(dict)
275292
for i, k1 in enumerate(arg1.columns):
276293
for j, k2 in enumerate(arg2.columns):
277294
if j<i and arg2 is arg1:
278295
# Symmetric case
279-
results[k1][k2] = results[k2][k1]
296+
results[i][j] = results[j][i]
280297
else:
281-
results[k1][k2] = f(*_prep_binary(arg1[k1], arg2[k2]))
282-
return Panel.from_dict(results).swapaxes('items', 'major')
298+
results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]))
299+
p = Panel.from_dict(results).swapaxes('items', 'major')
300+
p.major_axis = arg1.columns[p.major_axis]
301+
p.minor_axis = arg2.columns[p.minor_axis]
302+
return p
283303
else:
284304
raise ValueError("'pairwise' is not True/False")
285305
else:
286-
res_columns = arg1.columns
287-
X, Y = arg1.align(arg2, axis=0, join='outer')
288306
results = {}
307+
for i, col in enumerate(arg1.columns):
308+
results[i] = f(*_prep_binary(arg1.iloc[:, i], arg2))
309+
return dataframe_from_int_dict(results, arg1)
289310

290-
for col in res_columns:
291-
results[col] = f(X[col], Y)
292-
293-
return DataFrame(results, index=X.index, columns=res_columns)
294311
else:
295312
return _flex_binary_moment(arg2, arg1, f)
296313

pandas/stats/tests/test_moments.py

+114-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull
1010
from pandas.util.testing import (
11-
assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal
11+
assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal
1212
)
1313
import pandas.core.datetools as datetools
1414
import pandas.stats.moments as mom
@@ -970,6 +970,119 @@ def test_expanding_corr_pairwise_diff_length(self):
970970
assert_frame_equal(result2, expected)
971971
assert_frame_equal(result3, expected)
972972
assert_frame_equal(result4, expected)
973+
974+
def test_pairwise_stats_column_names_order(self):
975+
# GH 7738
976+
df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]),
977+
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1,0]),
978+
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1,1]),
979+
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=['C','C']),
980+
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1.,0]),
981+
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0.,1]),
982+
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=['C',1]),
983+
DataFrame([[2.,4.],[1.,2.],[5.,2.],[8.,1.]], columns=[1,0.]),
984+
DataFrame([[2,4.],[1,2.],[5,2.],[8,1.]], columns=[0,1.]),
985+
DataFrame([[2,4],[1,2],[5,2],[8,1.]], columns=[1.,'X']),
986+
]
987+
df2 = DataFrame([[None,1,1],[None,1,2],[None,3,2],[None,8,1]], columns=['Y','Z','X'])
988+
s = Series([1,1,3,8])
989+
990+
# DataFrame methods (which do not call _flex_binary_moment())
991+
for f in [lambda x: x.cov(),
992+
lambda x: x.corr(),
993+
]:
994+
results = [f(df) for df in df1s]
995+
for (df, result) in zip(df1s, results):
996+
assert_index_equal(result.index, df.columns)
997+
assert_index_equal(result.columns, df.columns)
998+
for i, result in enumerate(results):
999+
if i > 0:
1000+
self.assert_numpy_array_equivalent(result, results[0])
1001+
1002+
# DataFrame with itself, pairwise=True
1003+
for f in [lambda x: mom.expanding_cov(x, pairwise=True),
1004+
lambda x: mom.expanding_corr(x, pairwise=True),
1005+
lambda x: mom.rolling_cov(x, window=3, pairwise=True),
1006+
lambda x: mom.rolling_corr(x, window=3, pairwise=True),
1007+
lambda x: mom.ewmcov(x, com=3, pairwise=True),
1008+
lambda x: mom.ewmcorr(x, com=3, pairwise=True),
1009+
]:
1010+
results = [f(df) for df in df1s]
1011+
for (df, result) in zip(df1s, results):
1012+
assert_index_equal(result.items, df.index)
1013+
assert_index_equal(result.major_axis, df.columns)
1014+
assert_index_equal(result.minor_axis, df.columns)
1015+
for i, result in enumerate(results):
1016+
if i > 0:
1017+
self.assert_numpy_array_equivalent(result, results[0])
1018+
1019+
# DataFrame with itself, pairwise=False
1020+
for f in [lambda x: mom.expanding_cov(x, pairwise=False),
1021+
lambda x: mom.expanding_corr(x, pairwise=False),
1022+
lambda x: mom.rolling_cov(x, window=3, pairwise=False),
1023+
lambda x: mom.rolling_corr(x, window=3, pairwise=False),
1024+
lambda x: mom.ewmcov(x, com=3, pairwise=False),
1025+
lambda x: mom.ewmcorr(x, com=3, pairwise=False),
1026+
]:
1027+
results = [f(df) for df in df1s]
1028+
for (df, result) in zip(df1s, results):
1029+
assert_index_equal(result.index, df.index)
1030+
assert_index_equal(result.columns, df.columns)
1031+
for i, result in enumerate(results):
1032+
if i > 0:
1033+
self.assert_numpy_array_equivalent(result, results[0])
1034+
1035+
# DataFrame with another DataFrame, pairwise=True
1036+
for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=True),
1037+
lambda x, y: mom.expanding_corr(x, y, pairwise=True),
1038+
lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=True),
1039+
lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=True),
1040+
lambda x, y: mom.ewmcov(x, y, com=3, pairwise=True),
1041+
lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=True),
1042+
]:
1043+
results = [f(df, df2) for df in df1s]
1044+
for (df, result) in zip(df1s, results):
1045+
assert_index_equal(result.items, df.index)
1046+
assert_index_equal(result.major_axis, df.columns)
1047+
assert_index_equal(result.minor_axis, df2.columns)
1048+
for i, result in enumerate(results):
1049+
if i > 0:
1050+
self.assert_numpy_array_equivalent(result, results[0])
1051+
1052+
# DataFrame with another DataFrame, pairwise=False
1053+
for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=False),
1054+
lambda x, y: mom.expanding_corr(x, y, pairwise=False),
1055+
lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=False),
1056+
lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=False),
1057+
lambda x, y: mom.ewmcov(x, y, com=3, pairwise=False),
1058+
lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=False),
1059+
]:
1060+
results = [f(df, df2) if df.columns.is_unique else None for df in df1s]
1061+
for (df, result) in zip(df1s, results):
1062+
if result is not None:
1063+
expected_index = df.index.union(df2.index)
1064+
expected_columns = df.columns.union(df2.columns)
1065+
assert_index_equal(result.index, expected_index)
1066+
assert_index_equal(result.columns, expected_columns)
1067+
else:
1068+
tm.assertRaisesRegexp(ValueError, "'arg1' columns are not unique", f, df, df2)
1069+
tm.assertRaisesRegexp(ValueError, "'arg2' columns are not unique", f, df2, df)
1070+
1071+
# DataFrame with a Series
1072+
for f in [lambda x, y: mom.expanding_cov(x, y),
1073+
lambda x, y: mom.expanding_corr(x, y),
1074+
lambda x, y: mom.rolling_cov(x, y, window=3),
1075+
lambda x, y: mom.rolling_corr(x, y, window=3),
1076+
lambda x, y: mom.ewmcov(x, y, com=3),
1077+
lambda x, y: mom.ewmcorr(x, y, com=3),
1078+
]:
1079+
results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s]
1080+
for (df, result) in zip(df1s, results):
1081+
assert_index_equal(result.index, df.index)
1082+
assert_index_equal(result.columns, df.columns)
1083+
for i, result in enumerate(results):
1084+
if i > 0:
1085+
self.assert_numpy_array_equivalent(result, results[0])
9731086

9741087
def test_rolling_skew_edge_cases(self):
9751088

0 commit comments

Comments
 (0)