ENH: Respect Key Ordering for OrderedDict List in DataFrame Init

gfyoung · jreback · commit d1916404bc60 · 2016-05-31T10:30:23.000-04:00
Title is self-explanatory. Closes #13304. Author: gfyoung <gfyoung17@gmail.com> Closes #13309 from gfyoung/ordereddict-key-ordering-init and squashes the following commits: 4f311cc [gfyoung] ENH: Respect key ordering for OrderedDict list in DataFrame init
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -88,6 +88,7 @@ Other enhancements
 - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
 - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
 
+- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects is passed in (:issue:`13304`)
 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
 
 - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules.  New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5537,7 +5537,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
 def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
     if columns is None:
         gen = (list(x.keys()) for x in data)
-        columns = lib.fast_unique_multiple_list_gen(gen)
+        sort = not any(isinstance(d, OrderedDict) for d in data)
+        columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
 
     # assure that they are of the base dict class and not of derived
     # classes
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -493,7 +493,21 @@ def fast_unique_multiple_list(list lists):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple_list_gen(object gen):
+def fast_unique_multiple_list_gen(object gen, bint sort=True):
+    """
+    Generate a list of unique values from a generator of lists.
+
+    Parameters
+    ----------
+    gen : generator object
+        A generator of lists from which the unique list is created
+    sort : boolean, default True
+        Whether or not to sort the resulting unique list
+
+    Returns
+    -------
+    unique_list : list of unique values
+    """
     cdef:
         list buf
         Py_ssize_t j, n
@@ -508,11 +522,11 @@ def fast_unique_multiple_list_gen(object gen):
             if val not in table:
                 table[val] = stub
                 uniques.append(val)
-
-    try:
-        uniques.sort()
-    except Exception:
-        pass
+    if sort:
+        try:
+            uniques.sort()
+        except Exception:
+            pass
 
     return uniques
 
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -891,6 +891,45 @@ def test_constructor_list_of_dicts(self):
         expected = DataFrame(index=[0])
         tm.assert_frame_equal(result, expected)
 
+    def test_constructor_ordered_dict_preserve_order(self):
+        # see gh-13304
+        expected = DataFrame([[2, 1]], columns=['b', 'a'])
+
+        data = OrderedDict()
+        data['b'] = [2]
+        data['a'] = [1]
+
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, expected)
+
+        data = OrderedDict()
+        data['b'] = 2
+        data['a'] = 1
+
+        result = DataFrame([data])
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_ordered_dict_conflicting_orders(self):
+        # the first dict element sets the ordering for the DataFrame,
+        # even if there are conflicting orders from subsequent ones
+        row_one = OrderedDict()
+        row_one['b'] = 2
+        row_one['a'] = 1
+
+        row_two = OrderedDict()
+        row_two['a'] = 1
+        row_two['b'] = 2
+
+        row_three = {'b': 2, 'a': 1}
+
+        expected = DataFrame([[2, 1], [2, 1]], columns=['b', 'a'])
+        result = DataFrame([row_one, row_two])
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=['b', 'a'])
+        result = DataFrame([row_one, row_two, row_three])
+        tm.assert_frame_equal(result, expected)
+
     def test_constructor_list_of_series(self):
         data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
                 OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])]
@@ -1870,3 +1909,9 @@ def test_from_index(self):
         tm.assert_series_equal(df2[0], Series(idx2, name=0))
         df2 = DataFrame(Series(idx2))
         tm.assert_series_equal(df2[0], Series(idx2, name=0))
+
+if __name__ == '__main__':
+    import nose  # noqa
+
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
@@ -24,6 +24,19 @@ def test_max_len_string_array(self):
         tm.assertRaises(TypeError,
                         lambda: lib.max_len_string_array(arr.astype('U')))
 
+    def test_fast_unique_multiple_list_gen_sort(self):
+        keys = [['p', 'a'], ['n', 'd'], ['a', 's']]
+
+        gen = (key for key in keys)
+        expected = np.array(['a', 'd', 'n', 'p', 's'])
+        out = lib.fast_unique_multiple_list_gen(gen, sort=True)
+        tm.assert_numpy_array_equal(np.array(out), expected)
+
+        gen = (key for key in keys)
+        expected = np.array(['p', 'a', 'n', 'd', 's'])
+        out = lib.fast_unique_multiple_list_gen(gen, sort=False)
+        tm.assert_numpy_array_equal(np.array(out), expected)
+
 
 class TestIndexing(tm.TestCase):