ENH: Let initialisation from dicts use insertion order for python >= 3.6 (part III) (pandas-dev#19884)

topper-123 · jreback · commit e6c7dea15085 · 2018-03-02T06:19:07.000-05:00
diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
@@ -81,9 +81,28 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``.
 
 **From dict**
 
-If ``data`` is a dict, if **index** is passed the values in data corresponding
-to the labels in the index will be pulled out. Otherwise, an index will be
-constructed from the sorted keys of the dict, if possible.
+Series can be instantiated from dicts:
+
+.. ipython:: python
+
+   d = {'b' : 1, 'a' : 0, 'c' : 2}
+   pd.Series(d)
+
+.. note::
+
+   When the data is a dict, and an index is not passed, the ``Series`` index
+   will be ordered by the dict's insertion order, if you're using Python
+   version >= 3.6 and Pandas version >= 0.23.
+
+   If you're using Python < 3.6 or Pandas < 0.23, and an index is not passed,
+   the ``Series`` index will be the lexically ordered list of dict keys.
+
+In the example above, if you were on a Python version lower than 3.6 or a
+Pandas version lower than 0.23, the ``Series`` would be ordered by the lexical
+order of the dict keys (i.e. ``['a', 'b', 'c']`` rather than ``['b', 'a', 'c']``).
+
+If an index is passed, the values in data corresponding to the labels in the
+index will be pulled out.
 
 .. ipython:: python
 
@@ -243,12 +262,22 @@ not matching up to the passed index.
 If axis labels are not passed, they will be constructed from the input data
 based on common sense rules.
 
+.. note::
+
+   When the data is a dict, and ``columns`` is not specified, the ``DataFrame``
+   columns will be ordered by the dict's insertion order, if you are using
+   Python version >= 3.6 and Pandas >= 0.23.
+
+   If you are using Python < 3.6 or Pandas < 0.23, and ``columns`` is not
+   specified, the ``DataFrame`` columns will be the lexically ordered list of dict
+   keys.
+
 From dict of Series or dicts
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The resulting **index** will be the **union** of the indexes of the various
 Series. If there are any nested dicts, these will first be converted to
-Series. If no columns are passed, the columns will be the sorted list of dict
+Series. If no columns are passed, the columns will be the ordered list of dict
 keys.
 
 .. ipython:: python
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -3,7 +3,7 @@
 v0.23.0
 -------
 
-This is a major release from 0.21.1 and includes a number of API changes,
+This is a major release from 0.22.0 and includes a number of API changes,
 deprecations, new features, enhancements, and performance improvements along
 with a large number of bug fixes. We recommend that all users upgrade to this
 version.
@@ -249,7 +249,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
   using ``.assign()`` to update an existing column. Previously, callables
   referring to other variables being updated would get the "old" values
 
-  Previous Behaviour:
+  Previous Behavior:
 
   .. code-block:: ipython
 
@@ -262,7 +262,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
       1  3 -2
       2  4 -3
 
-  New Behaviour:
+  New Behavior:
 
   .. ipython:: python
 
@@ -361,6 +361,57 @@ If installed, we now require:
 | openpyxl        | 2.4.0           |          |
 +-----------------+-----------------+----------+
 
+.. _whatsnew_0230.api_breaking.dict_insertion_order:
+
+Instantation from dicts preserves dict insertion order for python 3.6+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Until Python 3.6, dicts in Python had no formally defined ordering. For Python
+version 3.6 and later, dicts are ordered by insertion order, see
+`PEP 468 <https://www.python.org/dev/peps/pep-0468/>`_.
+Pandas will use the dict's insertion order, when creating a ``Series`` or
+``DataFrame`` from a dict and you're using Python version 3.6 or
+higher. (:issue:`19884`)
+
+Previous Behavior (and current behavior if on Python < 3.6):
+
+.. code-block:: ipython
+
+   In [1]: pd.Series({'Income': 2000,
+   ...                'Expenses': -1500,
+   ...                'Taxes': -200,
+   ...                'Net result': 300})
+   Expenses     -1500
+   Income        2000
+   Net result     300
+   Taxes         -200
+   dtype: int64
+
+Note the Series above is ordered alphabetically by the index values.
+
+New Behavior (for Python >= 3.6):
+
+.. ipython:: python
+
+    pd.Series({'Income': 2000,
+               'Expenses': -1500,
+               'Taxes': -200,
+               'Net result': 300})
+
+Notice that the Series is now ordered by insertion order. This new behavior is
+used for all relevant pandas types (``Series``, ``DataFrame``, ``SparseSeries``
+and ``SparseDataFrame``).
+
+If you wish to retain the old behavior while using Python >= 3.6, you can use
+``.sort_index()``:
+
+.. ipython:: python
+
+    pd.Series({'Income': 2000,
+               'Expenses': -1500,
+               'Taxes': -200,
+               'Net result': 300}).sort_index()
+
 .. _whatsnew_0230.api_breaking.deprecate_panel:
 
 Deprecate Panel
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -11,7 +11,7 @@
 from pandas._libs import lib, tslib
 
 from pandas import compat
-from pandas.compat import long, zip, iteritems
+from pandas.compat import long, zip, iteritems, PY36, OrderedDict
 from pandas.core.config import get_option
 from pandas.core.dtypes.generic import ABCSeries, ABCIndex
 from pandas.core.dtypes.common import _NS_DTYPE
@@ -186,6 +186,16 @@ def _try_sort(iterable):
         return listed
 
 
+def _dict_keys_to_ordered_list(mapping):
+    # when pandas drops support for Python < 3.6, this function
+    # can be replaced by a simple list(mapping.keys())
+    if PY36 or isinstance(mapping, OrderedDict):
+        keys = list(mapping.keys())
+    else:
+        keys = _try_sort(mapping)
+    return keys
+
+
 def iterpairs(seq):
     """
     Parameters
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -252,6 +252,11 @@ class DataFrame(NDFrame):
     ----------
     data : numpy ndarray (structured or homogeneous), dict, or DataFrame
         Dict can contain Series, arrays, constants, or list-like objects
+
+        .. versionchanged :: 0.23.0
+           If data is a dict, argument order is maintained for Python 3.6
+           and later.
+
     index : Index or array-like
         Index to use for resulting frame. Will default to RangeIndex if
         no indexing information part of input data and no index provided
@@ -460,9 +465,7 @@ def _init_dict(self, data, index, columns, dtype=None):
                 arrays.append(v)
 
         else:
-            keys = list(data.keys())
-            if not isinstance(data, OrderedDict):
-                keys = com._try_sort(keys)
+            keys = com._dict_keys_to_ordered_list(data)
             columns = data_names = Index(keys)
             arrays = [data[k] for k in keys]
 
diff --git a/pandas/core/panel.py b/pandas/core/panel.py
@@ -204,10 +204,8 @@ def _init_dict(self, data, axes, dtype=None):
                                for k, v in compat.iteritems(data)
                                if k in haxis)
         else:
-            ks = list(data.keys())
-            if not isinstance(data, OrderedDict):
-                ks = com._try_sort(ks)
-            haxis = Index(ks)
+            keys = com._dict_keys_to_ordered_list(data)
+            haxis = Index(keys)
 
         for k, v in compat.iteritems(data):
             if isinstance(v, dict):
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -54,7 +54,7 @@
 from pandas import compat
 from pandas.io.formats.terminal import get_terminal_size
 from pandas.compat import (
-    zip, u, OrderedDict, StringIO, range, get_range_parameters)
+    zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36)
 from pandas.compat.numpy import function as nv
 
 import pandas.core.ops as ops
@@ -130,6 +130,11 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
     ----------
     data : array-like, dict, or scalar value
         Contains data stored in Series
+
+        .. versionchanged :: 0.23.0
+           If data is a dict, argument order is maintained for Python 3.6
+           and later.
+
     index : array-like or Index (1d)
         Values must be hashable and have the same length as `data`.
         Non-unique index values are allowed. Will default to
@@ -297,7 +302,7 @@ def _init_dict(self, data, index=None, dtype=None):
         # Now we just make sure the order is respected, if any
         if index is not None:
             s = s.reindex(index, copy=False)
-        elif not isinstance(data, OrderedDict):
+        elif not PY36 and not isinstance(data, OrderedDict):
             try:
                 s = s.sort_index()
             except TypeError:
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
@@ -39,6 +39,10 @@ class SparseDataFrame(DataFrame):
     Parameters
     ----------
     data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
+        .. versionchanged :: 0.23.0
+           If data is a dict, argument order is maintained for Python 3.6
+           and later.
+
     index : array-like, optional
     column : array-like, optional
     default_kind : {'block', 'integer'}, default 'block'
@@ -138,7 +142,8 @@ def _init_dict(self, data, index, columns, dtype=None):
             columns = _ensure_index(columns)
             data = {k: v for k, v in compat.iteritems(data) if k in columns}
         else:
-            columns = Index(com._try_sort(list(data.keys())))
+            keys = com._dict_keys_to_ordered_list(data)
+            columns = Index(keys)
 
         if index is None:
             index = extract_index(list(data.values()))
diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
@@ -42,6 +42,10 @@ class SparseSeries(Series):
     Parameters
     ----------
     data : {array-like, Series, SparseSeries, dict}
+        .. versionchanged :: 0.23.0
+           If data is a dict, argument order is maintained for Python 3.6
+           and later.
+
     kind : {'block', 'integer'}
     fill_value : float
         Code for missing value. Defaults depends on dtype.
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -15,7 +15,7 @@
 
 from pandas.core.dtypes.common import is_integer_dtype
 from pandas.compat import (lmap, long, zip, range, lrange, lzip,
-                           OrderedDict, is_platform_little_endian)
+                           OrderedDict, is_platform_little_endian, PY36)
 from pandas import compat
 from pandas import (DataFrame, Index, Series, isna,
                     MultiIndex, Timedelta, Timestamp,
@@ -290,6 +290,24 @@ def test_constructor_dict(self):
         with tm.assert_raises_regex(ValueError, msg):
             DataFrame({'a': 0.7}, columns=['b'])
 
+    @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6')
+    def test_constructor_dict_order_insertion(self):
+        # GH19018
+        # initialization ordering: by insertion order if python>= 3.6
+        d = {'b': self.ts2, 'a': self.ts1}
+        frame = DataFrame(data=d)
+        expected = DataFrame(data=d, columns=list('ba'))
+        tm.assert_frame_equal(frame, expected)
+
+    @pytest.mark.skipif(PY36, reason='order by value for Python<3.6')
+    def test_constructor_dict_order_by_values(self):
+        # GH19018
+        # initialization ordering: by value if python<3.6
+        d = {'b': self.ts2, 'a': self.ts1}
+        frame = DataFrame(data=d)
+        expected = DataFrame(data=d, columns=list('ab'))
+        tm.assert_frame_equal(frame, expected)
+
     def test_constructor_multi_index(self):
         # GH 4078
         # construction error with mi and all-nan frame
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -762,17 +762,17 @@ def test_read_excel_multiindex_empty_level(self, ext):
         # GH 12453
         with ensure_clean('.xlsx') as path:
             df = DataFrame({
-                ('Zero', ''): {0: 0},
                 ('One', 'x'): {0: 1},
                 ('Two', 'X'): {0: 3},
-                ('Two', 'Y'): {0: 7}
+                ('Two', 'Y'): {0: 7},
+                ('Zero', ''): {0: 0}
             })
 
             expected = DataFrame({
-                ('Zero', 'Unnamed: 3_level_1'): {0: 0},
                 ('One', u'x'): {0: 1},
                 ('Two', u'X'): {0: 3},
-                ('Two', u'Y'): {0: 7}
+                ('Two', u'Y'): {0: 7},
+                ('Zero', 'Unnamed: 3_level_1'): {0: 0}
             })
 
             df.to_excel(path)
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
@@ -2034,7 +2034,7 @@ def test_table_values_dtypes_roundtrip(self):
                                'bool': 1, 'int16': 1, 'int8': 1,
                                'int64': 1, 'object': 1, 'datetime64[ns]': 2})
             result = result.sort_index()
-            result = expected.sort_index()
+            expected = expected.sort_index()
             tm.assert_series_equal(result, expected)
 
     def test_table_mixed_dtypes(self):
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -22,7 +22,7 @@
 from pandas._libs import lib
 from pandas._libs.tslib import iNaT
 
-from pandas.compat import lrange, range, zip, long
+from pandas.compat import lrange, range, zip, long, PY36
 from pandas.util.testing import assert_series_equal
 import pandas.util.testing as tm
 
@@ -811,6 +811,18 @@ def test_constructor_dict(self):
         expected.iloc[1] = 1
         assert_series_equal(result, expected)
 
+    def test_constructor_dict_order(self):
+        # GH19018
+        # initialization ordering: by insertion order if python>= 3.6, else
+        # order by value
+        d = {'b': 1, 'a': 0, 'c': 2}
+        result = Series(d)
+        if PY36:
+            expected = Series([1, 0, 2], index=list('bac'))
+        else:
+            expected = Series([0, 1, 2], index=list('abc'))
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
     def test_constructor_dict_nan_key(self, value):
         # GH 18480
diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py
@@ -139,6 +139,18 @@ def test_constructor(self):
 
         repr(self.frame)
 
+    def test_constructor_dict_order(self):
+        # GH19018
+        # initialization ordering: by insertion order if python>= 3.6, else
+        # order by value
+        d = {'b': [2, 3], 'a': [0, 1]}
+        frame = SparseDataFrame(data=d)
+        if compat.PY36:
+            expected = SparseDataFrame(data=d, columns=list('ba'))
+        else:
+            expected = SparseDataFrame(data=d, columns=list('ab'))
+        tm.assert_sp_frame_equal(frame, expected)
+
     def test_constructor_ndarray(self):
         # no index or columns
         sp = SparseDataFrame(self.frame.values)
diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py