From 4f311cc64dde396edf5df0f27c509f5241cf90cf Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 28 May 2016 02:48:05 +0200 Subject: [PATCH] ENH: Respect key ordering for OrderedDict list in DataFrame init Closes gh-13304. --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/frame.py | 3 +- pandas/lib.pyx | 26 ++++++++++---- pandas/tests/frame/test_constructors.py | 45 +++++++++++++++++++++++++ pandas/tests/test_lib.py | 13 +++++++ 5 files changed, 81 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 2b67aca1dcf74..6102c5f41300f 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -87,6 +87,7 @@ Other enhancements - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) +- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects is passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. 
(:issue:`12388`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c8106571f198..69def7502a6f7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5537,7 +5537,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: gen = (list(x.keys()) for x in data) - columns = lib.fast_unique_multiple_list_gen(gen) + sort = not any(isinstance(d, OrderedDict) for d in data) + columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) # assure that they are of the base dict class and not of derived # classes diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 328166168a3fc..a9c7f93097f1b 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -493,7 +493,21 @@ def fast_unique_multiple_list(list lists): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list_gen(object gen): +def fast_unique_multiple_list_gen(object gen, bint sort=True): + """ + Generate a list of unique values from a generator of lists. 
+ + Parameters + ---------- + gen : generator object + A generator of lists from which the unique list is created + sort : boolean + Whether or not to sort the resulting unique list + + Returns + ------- + unique_list : list of unique values + """ cdef: list buf Py_ssize_t j, n @@ -508,11 +522,11 @@ def fast_unique_multiple_list_gen(object gen): if val not in table: table[val] = stub uniques.append(val) - - try: - uniques.sort() - except Exception: - pass + if sort: + try: + uniques.sort() + except Exception: + pass return uniques diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a050d74f0fc51..b42aef9447373 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -891,6 +891,45 @@ def test_constructor_list_of_dicts(self): expected = DataFrame(index=[0]) tm.assert_frame_equal(result, expected) + def test_constructor_ordered_dict_preserve_order(self): + # see gh-13304 + expected = DataFrame([[2, 1]], columns=['b', 'a']) + + data = OrderedDict() + data['b'] = [2] + data['a'] = [1] + + result = DataFrame(data) + tm.assert_frame_equal(result, expected) + + data = OrderedDict() + data['b'] = 2 + data['a'] = 1 + + result = DataFrame([data]) + tm.assert_frame_equal(result, expected) + + def test_constructor_ordered_dict_conflicting_orders(self): + # the first dict element sets the ordering for the DataFrame, + # even if there are conflicting orders from subsequent ones + row_one = OrderedDict() + row_one['b'] = 2 + row_one['a'] = 1 + + row_two = OrderedDict() + row_two['a'] = 1 + row_two['b'] = 2 + + row_three = {'b': 2, 'a': 1} + + expected = DataFrame([[2, 1], [2, 1]], columns=['b', 'a']) + result = DataFrame([row_one, row_two]) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=['b', 'a']) + result = DataFrame([row_one, row_two, row_three]) + tm.assert_frame_equal(result, expected) + def test_constructor_list_of_series(self): 
data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]), OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])] @@ -1870,3 +1909,9 @@ def test_from_index(self): tm.assert_series_equal(df2[0], Series(idx2, name=0)) df2 = DataFrame(Series(idx2)) tm.assert_series_equal(df2[0], Series(idx2, name=0)) + +if __name__ == '__main__': + import nose # noqa + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index bfac0aa83b434..10a6bb5c75b01 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -24,6 +24,19 @@ def test_max_len_string_array(self): tm.assertRaises(TypeError, lambda: lib.max_len_string_array(arr.astype('U'))) + def test_fast_unique_multiple_list_gen_sort(self): + keys = [['p', 'a'], ['n', 'd'], ['a', 's']] + + gen = (key for key in keys) + expected = np.array(['a', 'd', 'n', 'p', 's']) + out = lib.fast_unique_multiple_list_gen(gen, sort=True) + tm.assert_numpy_array_equal(np.array(out), expected) + + gen = (key for key in keys) + expected = np.array(['p', 'a', 'n', 'd', 's']) + out = lib.fast_unique_multiple_list_gen(gen, sort=False) + tm.assert_numpy_array_equal(np.array(out), expected) + class TestIndexing(tm.TestCase):