Skip to content

ENH: Respect Key Ordering for OrderedDict List in DataFrame Init #13309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ Other enhancements
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)

- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects is passed in (:issue:`13304`)
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)

- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`)
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5537,7 +5537,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
if columns is None:
gen = (list(x.keys()) for x in data)
columns = lib.fast_unique_multiple_list_gen(gen)
sort = not any(isinstance(d, OrderedDict) for d in data)
columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)

# assure that they are of the base dict class and not of derived
# classes
Expand Down
26 changes: 20 additions & 6 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,21 @@ def fast_unique_multiple_list(list lists):

@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list_gen(object gen):
def fast_unique_multiple_list_gen(object gen, bint sort=True):
"""
Generate a list of unique values from a generator of lists.

Parameters
----------
gen : generator object
A generator of lists from which the unique list is created
sort : boolean, default True
Whether or not to sort the resulting unique list

Returns
-------
unique_list : list of unique values
"""
cdef:
list buf
Py_ssize_t j, n
Expand All @@ -508,11 +522,11 @@ def fast_unique_multiple_list_gen(object gen):
if val not in table:
table[val] = stub
uniques.append(val)

try:
uniques.sort()
except Exception:
pass
if sort:
try:
uniques.sort()
except Exception:
pass

return uniques

Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,45 @@ def test_constructor_list_of_dicts(self):
expected = DataFrame(index=[0])
tm.assert_frame_equal(result, expected)

def test_constructor_ordered_dict_preserve_order(self):
# see gh-13304
expected = DataFrame([[2, 1]], columns=['b', 'a'])

data = OrderedDict()
data['b'] = [2]
data['a'] = [1]

result = DataFrame(data)
tm.assert_frame_equal(result, expected)

data = OrderedDict()
data['b'] = 2
data['a'] = 1

result = DataFrame([data])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

like this, but for example have a dict and OrderedDict in a list

tm.assert_frame_equal(result, expected)

def test_constructor_ordered_dict_conflicting_orders(self):
# the first dict element sets the ordering for the DataFrame,
# even if there are conflicting orders from subsequent ones
row_one = OrderedDict()
row_one['b'] = 2
row_one['a'] = 1

row_two = OrderedDict()
row_two['a'] = 1
row_two['b'] = 2

row_three = {'b': 2, 'a': 1}

expected = DataFrame([[2, 1], [2, 1]], columns=['b', 'a'])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this hit your code path? e.g. columns is not None here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That statement does not because no dict objects are being passed in.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still need a test for a mix of dict/OrderedDict being passed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's my last test (see below)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahh ok

result = DataFrame([row_one, row_two])
tm.assert_frame_equal(result, expected)

expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=['b', 'a'])
result = DataFrame([row_one, row_two, row_three])
tm.assert_frame_equal(result, expected)

def test_constructor_list_of_series(self):
data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]),
OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])]
Expand Down Expand Up @@ -1870,3 +1909,9 @@ def test_from_index(self):
tm.assert_series_equal(df2[0], Series(idx2, name=0))
df2 = DataFrame(Series(idx2))
tm.assert_series_equal(df2[0], Series(idx2, name=0))

if __name__ == '__main__':
    # Allow running this test module directly: verbose output, stop on the
    # first failure, and drop into pdb on errors and failures.
    import nose  # noqa

    nose_args = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_args, exit=False)
13 changes: 13 additions & 0 deletions pandas/tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@ def test_max_len_string_array(self):
tm.assertRaises(TypeError,
lambda: lib.max_len_string_array(arr.astype('U')))

def test_fast_unique_multiple_list_gen_sort(self):
    # ``sort=True`` must return the unique keys in sorted order, while
    # ``sort=False`` must keep them in first-seen order.
    keys = [['p', 'a'], ['n', 'd'], ['a', 's']]
    cases = [(True, ['a', 'd', 'n', 'p', 's']),
             (False, ['p', 'a', 'n', 'd', 's'])]

    for sort_flag, wanted in cases:
        # a fresh generator is needed for each call since it is consumed
        gen = (key for key in keys)
        expected = np.array(wanted)
        out = lib.fast_unique_multiple_list_gen(gen, sort=sort_flag)
        tm.assert_numpy_array_equal(np.array(out), expected)


class TestIndexing(tm.TestCase):

Expand Down