Merge pull request #7738 from seth-p/flex_binary_moment_column_order

jreback · jreback · commit 99b7c8c065c9 · 2014-07-25T10:34:07.000-04:00
BUG: _flex_binary_moment() doesn't preserve column order or handle multiple columns with the same label
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -265,7 +265,7 @@ Bug Fixes
 
 
 
-- Bug in repeated timeseries line and area plot may result in ``ValueError`` or  incorrect kind (:issue:`7733`)
+- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
 
 
 
@@ -278,7 +278,10 @@ Bug Fixes
 - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
 - Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`)
 
-
+- Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_corr``, ``ewmcov``, and ``ewmcorr``
+  returning results with columns sorted by name and producing an error for non-unique columns;
+  now handles non-unique columns and returns columns in original order
+  (except for the case of two DataFrames with ``pairwise=False``, where behavior is unchanged) (:issue:`7542`)
 
 
 
diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py
@@ -259,38 +259,55 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False):
             isinstance(arg2, (np.ndarray,Series)):
         X, Y = _prep_binary(arg1, arg2)
         return f(X, Y)
+
     elif isinstance(arg1, DataFrame):
+        def dataframe_from_int_dict(data, frame_template):
+            result = DataFrame(data, index=frame_template.index)
+            result.columns = frame_template.columns[result.columns]
+            return result
+
         results = {}
         if isinstance(arg2, DataFrame):
-            X, Y = arg1.align(arg2, join='outer')
             if pairwise is False:
-                X = X + 0 * Y
-                Y = Y + 0 * X
-                res_columns = arg1.columns.union(arg2.columns)
-                for col in res_columns:
-                    if col in X and col in Y:
-                        results[col] = f(X[col], Y[col])
+                if arg1 is arg2:
+                    # special case in order to handle duplicate column names
+                    for i, col in enumerate(arg1.columns):
+                        results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
+                    return dataframe_from_int_dict(results, arg1)
+                else:
+                    if not arg1.columns.is_unique:
+                        raise ValueError("'arg1' columns are not unique")
+                    if not arg2.columns.is_unique:
+                        raise ValueError("'arg2' columns are not unique")
+                    X, Y = arg1.align(arg2, join='outer')
+                    X = X + 0 * Y
+                    Y = Y + 0 * X
+                    res_columns = arg1.columns.union(arg2.columns)
+                    for col in res_columns:
+                        if col in X and col in Y:
+                            results[col] = f(X[col], Y[col])
+                    return DataFrame(results, index=X.index, columns=res_columns)
             elif pairwise is True:
                 results = defaultdict(dict)
                 for i, k1 in enumerate(arg1.columns):
                     for j, k2 in enumerate(arg2.columns):
                         if j<i and arg2 is arg1:
                             # Symmetric case
-                            results[k1][k2] = results[k2][k1]
+                            results[i][j] = results[j][i]
                         else:
-                            results[k1][k2] = f(*_prep_binary(arg1[k1], arg2[k2]))
-                return Panel.from_dict(results).swapaxes('items', 'major')
+                            results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]))
+                p = Panel.from_dict(results).swapaxes('items', 'major')
+                p.major_axis = arg1.columns[p.major_axis]
+                p.minor_axis = arg2.columns[p.minor_axis]
+                return p
             else:
                 raise ValueError("'pairwise' is not True/False")
         else:
-            res_columns = arg1.columns
-            X, Y = arg1.align(arg2, axis=0, join='outer')
             results = {}
+            for i, col in enumerate(arg1.columns):
+                results[i] = f(*_prep_binary(arg1.iloc[:, i], arg2))
+            return dataframe_from_int_dict(results, arg1)
 
-            for col in res_columns:
-                results[col] = f(X[col], Y)
-
-        return DataFrame(results, index=X.index, columns=res_columns)
     else:
         return _flex_binary_moment(arg2, arg1, f)
 
diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py
@@ -8,7 +8,7 @@
 
 from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull
 from pandas.util.testing import (
-    assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal
+    assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal
 )
 import pandas.core.datetools as datetools
 import pandas.stats.moments as mom
@@ -970,6 +970,119 @@ def test_expanding_corr_pairwise_diff_length(self):
         assert_frame_equal(result2, expected)
         assert_frame_equal(result3, expected)
         assert_frame_equal(result4, expected)
+    
+    def test_pairwise_stats_column_names_order(self):
+        # GH 7738
+        df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]),
+                DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1,0]),
+                DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1,1]),
+                DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=['C','C']),
+                DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1.,0]),
+                DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0.,1]),
+                DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=['C',1]),
+                DataFrame([[2.,4.],[1.,2.],[5.,2.],[8.,1.]], columns=[1,0.]),
+                DataFrame([[2,4.],[1,2.],[5,2.],[8,1.]], columns=[0,1.]),
+                DataFrame([[2,4],[1,2],[5,2],[8,1.]], columns=[1.,'X']),
+               ]
+        df2 = DataFrame([[None,1,1],[None,1,2],[None,3,2],[None,8,1]], columns=['Y','Z','X'])
+        s = Series([1,1,3,8])
+
+        # DataFrame methods (which do not call _flex_binary_moment())
+        for f in [lambda x: x.cov(),
+                  lambda x: x.corr(),
+                 ]:
+            results = [f(df) for df in df1s]
+            for (df, result) in zip(df1s, results):
+                assert_index_equal(result.index, df.columns)
+                assert_index_equal(result.columns, df.columns)
+            for i, result in enumerate(results):
+                if i > 0:
+                    self.assert_numpy_array_equivalent(result, results[0])
+
+        # DataFrame with itself, pairwise=True
+        for f in [lambda x: mom.expanding_cov(x, pairwise=True),
+                  lambda x: mom.expanding_corr(x, pairwise=True),
+                  lambda x: mom.rolling_cov(x, window=3, pairwise=True),
+                  lambda x: mom.rolling_corr(x, window=3, pairwise=True),
+                  lambda x: mom.ewmcov(x, com=3, pairwise=True),
+                  lambda x: mom.ewmcorr(x, com=3, pairwise=True),
+                 ]:
+            results = [f(df) for df in df1s]
+            for (df, result) in zip(df1s, results):
+                assert_index_equal(result.items, df.index)
+                assert_index_equal(result.major_axis, df.columns)
+                assert_index_equal(result.minor_axis, df.columns)
+            for i, result in enumerate(results):
+                if i > 0:
+                    self.assert_numpy_array_equivalent(result, results[0])
+
+        # DataFrame with itself, pairwise=False
+        for f in [lambda x: mom.expanding_cov(x, pairwise=False),
+                  lambda x: mom.expanding_corr(x, pairwise=False),
+                  lambda x: mom.rolling_cov(x, window=3, pairwise=False),
+                  lambda x: mom.rolling_corr(x, window=3, pairwise=False),
+                  lambda x: mom.ewmcov(x, com=3, pairwise=False),
+                  lambda x: mom.ewmcorr(x, com=3, pairwise=False),
+                 ]:
+            results = [f(df) for df in df1s]
+            for (df, result) in zip(df1s, results):
+                assert_index_equal(result.index, df.index)
+                assert_index_equal(result.columns, df.columns)
+            for i, result in enumerate(results):
+                if i > 0:
+                    self.assert_numpy_array_equivalent(result, results[0])
+
+        # DataFrame with another DataFrame, pairwise=True
+        for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=True),
+                  lambda x, y: mom.expanding_corr(x, y, pairwise=True),
+                  lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=True),
+                  lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=True),
+                  lambda x, y: mom.ewmcov(x, y, com=3, pairwise=True),
+                  lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=True),
+                 ]:
+            results = [f(df, df2) for df in df1s]
+            for (df, result) in zip(df1s, results):
+                assert_index_equal(result.items, df.index)
+                assert_index_equal(result.major_axis, df.columns)
+                assert_index_equal(result.minor_axis, df2.columns)
+            for i, result in enumerate(results):
+                if i > 0:
+                    self.assert_numpy_array_equivalent(result, results[0])
+
+        # DataFrame with another DataFrame, pairwise=False
+        for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=False),
+                  lambda x, y: mom.expanding_corr(x, y, pairwise=False),
+                  lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=False),
+                  lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=False),
+                  lambda x, y: mom.ewmcov(x, y, com=3, pairwise=False),
+                  lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=False),
+                 ]:
+            results = [f(df, df2) if df.columns.is_unique else None for df in df1s]
+            for (df, result) in zip(df1s, results):
+                if result is not None:
+                    expected_index = df.index.union(df2.index)
+                    expected_columns = df.columns.union(df2.columns)
+                    assert_index_equal(result.index, expected_index)
+                    assert_index_equal(result.columns, expected_columns)
+                else:
+                    tm.assertRaisesRegexp(ValueError, "'arg1' columns are not unique", f, df, df2)
+                    tm.assertRaisesRegexp(ValueError, "'arg2' columns are not unique", f, df2, df)
+
+        # DataFrame with a Series
+        for f in [lambda x, y: mom.expanding_cov(x, y),
+                  lambda x, y: mom.expanding_corr(x, y),
+                  lambda x, y: mom.rolling_cov(x, y, window=3),
+                  lambda x, y: mom.rolling_corr(x, y, window=3),
+                  lambda x, y: mom.ewmcov(x, y, com=3),
+                  lambda x, y: mom.ewmcorr(x, y, com=3),
+                 ]:
+            results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s]
+            for (df, result) in zip(df1s, results):
+                assert_index_equal(result.index, df.index)
+                assert_index_equal(result.columns, df.columns)
+            for i, result in enumerate(results):
+                if i > 0:
+                    self.assert_numpy_array_equivalent(result, results[0])
 
     def test_rolling_skew_edge_cases(self):