Skip to content

Commit d191640

Browse files
gfyoung authored and jreback committed
ENH: Respect Key Ordering for OrderedDict List in DataFrame Init
Title is self-explanatory. Closes #13304. Author: gfyoung <[email protected]> Closes #13309 from gfyoung/ordereddict-key-ordering-init and squashes the following commits: 4f311cc [gfyoung] ENH: Respect key ordering for OrderedDict list in DataFrame init
1 parent 132c1c5 commit d191640

File tree

5 files changed

+81
-7
lines changed

5 files changed

+81
-7
lines changed

doc/source/whatsnew/v0.18.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ Other enhancements
8888
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
8989
- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
9090

91+
- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects is passed in (:issue:`13304`)
9192
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
9293

9394
- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`)

pandas/core/frame.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -5537,7 +5537,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
55375537
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
55385538
if columns is None:
55395539
gen = (list(x.keys()) for x in data)
5540-
columns = lib.fast_unique_multiple_list_gen(gen)
5540+
sort = not any(isinstance(d, OrderedDict) for d in data)
5541+
columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
55415542

55425543
# assure that they are of the base dict class and not of derived
55435544
# classes

pandas/lib.pyx

+20-6
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,21 @@ def fast_unique_multiple_list(list lists):
493493

494494
@cython.wraparound(False)
495495
@cython.boundscheck(False)
496-
def fast_unique_multiple_list_gen(object gen):
496+
def fast_unique_multiple_list_gen(object gen, bint sort=True):
497+
"""
498+
Generate a list of unique values from a generator of lists.
499+
500+
Parameters
501+
----------
502+
gen : generator object
503+
A generator of lists from which the unique list is created
504+
sort : boolean
505+
Whether or not to sort the resulting unique list
506+
507+
Returns
508+
-------
509+
unique_list : list of unique values
510+
"""
497511
cdef:
498512
list buf
499513
Py_ssize_t j, n
@@ -508,11 +522,11 @@ def fast_unique_multiple_list_gen(object gen):
508522
if val not in table:
509523
table[val] = stub
510524
uniques.append(val)
511-
512-
try:
513-
uniques.sort()
514-
except Exception:
515-
pass
525+
if sort:
526+
try:
527+
uniques.sort()
528+
except Exception:
529+
pass
516530

517531
return uniques
518532

pandas/tests/frame/test_constructors.py

+45
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,45 @@ def test_constructor_list_of_dicts(self):
891891
expected = DataFrame(index=[0])
892892
tm.assert_frame_equal(result, expected)
893893

894+
def test_constructor_ordered_dict_preserve_order(self):
895+
# see gh-13304
896+
expected = DataFrame([[2, 1]], columns=['b', 'a'])
897+
898+
data = OrderedDict()
899+
data['b'] = [2]
900+
data['a'] = [1]
901+
902+
result = DataFrame(data)
903+
tm.assert_frame_equal(result, expected)
904+
905+
data = OrderedDict()
906+
data['b'] = 2
907+
data['a'] = 1
908+
909+
result = DataFrame([data])
910+
tm.assert_frame_equal(result, expected)
911+
912+
def test_constructor_ordered_dict_conflicting_orders(self):
913+
# the first dict element sets the ordering for the DataFrame,
914+
# even if there are conflicting orders from subsequent ones
915+
row_one = OrderedDict()
916+
row_one['b'] = 2
917+
row_one['a'] = 1
918+
919+
row_two = OrderedDict()
920+
row_two['a'] = 1
921+
row_two['b'] = 2
922+
923+
row_three = {'b': 2, 'a': 1}
924+
925+
expected = DataFrame([[2, 1], [2, 1]], columns=['b', 'a'])
926+
result = DataFrame([row_one, row_two])
927+
tm.assert_frame_equal(result, expected)
928+
929+
expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=['b', 'a'])
930+
result = DataFrame([row_one, row_two, row_three])
931+
tm.assert_frame_equal(result, expected)
932+
894933
def test_constructor_list_of_series(self):
895934
data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
896935
OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])]
@@ -1870,3 +1909,9 @@ def test_from_index(self):
18701909
tm.assert_series_equal(df2[0], Series(idx2, name=0))
18711910
df2 = DataFrame(Series(idx2))
18721911
tm.assert_series_equal(df2[0], Series(idx2, name=0))
1912+
1913+
if __name__ == '__main__':
1914+
import nose # noqa
1915+
1916+
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
1917+
exit=False)

pandas/tests/test_lib.py

+13
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,19 @@ def test_max_len_string_array(self):
2424
tm.assertRaises(TypeError,
2525
lambda: lib.max_len_string_array(arr.astype('U')))
2626

27+
def test_fast_unique_multiple_list_gen_sort(self):
28+
keys = [['p', 'a'], ['n', 'd'], ['a', 's']]
29+
30+
gen = (key for key in keys)
31+
expected = np.array(['a', 'd', 'n', 'p', 's'])
32+
out = lib.fast_unique_multiple_list_gen(gen, sort=True)
33+
tm.assert_numpy_array_equal(np.array(out), expected)
34+
35+
gen = (key for key in keys)
36+
expected = np.array(['p', 'a', 'n', 'd', 's'])
37+
out = lib.fast_unique_multiple_list_gen(gen, sort=False)
38+
tm.assert_numpy_array_equal(np.array(out), expected)
39+
2740

2841
class TestIndexing(tm.TestCase):
2942

0 commit comments

Comments
 (0)