Commit 5036c4f

REF: Refactor DataFrame dict constructor
When passing a dict and `columns=` to DataFrame, we previously passed the dict of {column: array} to the Series constructor. This eventually hit `construct_1d_object_array_from_listlike` [1]. For extension arrays, this ends up calling `ExtensionArray.__iter__`, iterating over the elements of the ExtensionArray, which is prohibitively slow.

We try to properly handle all the edge cases that we were papering over earlier by just passing the `data` to Series. We fix a bug or two along the way, but don't change any *tested* behavior, even if it looks fishy (e.g. pandas-dev#24385).

[1]: pandas-dev#24368 (comment)

Closes pandas-dev#24368
Closes pandas-dev#24386
1 parent a022bae commit 5036c4f
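For context, a minimal sketch (not part of the commit) of the case the commit message describes: a dict value that is an extension array, passed together with an explicit `columns=` argument. The names and sizes below are illustrative.

    import pandas as pd

    # Previously this was routed through the Series constructor, which could
    # fall back to construct_1d_object_array_from_listlike and iterate the
    # ExtensionArray element by element; the refactored init_dict maps each
    # dict key to its column position and keeps the array intact.
    cat = pd.Categorical(["a", "b"] * 50000)
    df = pd.DataFrame({"A": cat}, columns=["A"])
    print(df.dtypes)  # A    category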

3 files changed: +105 -32 lines changed

doc/source/whatsnew/v0.24.0.rst (+2 -1)

@@ -1401,10 +1401,11 @@ Numeric
 - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`)
 - Checking PEP 3141 numbers in :func:`~pandas.api.types.is_scalar` function returns ``True`` (:issue:`22903`)
 
-Conversion
+Conversion
 ^^^^^^^^^^
 
 - Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`)
+- Bug in :meth:`DataFrame.__init__` when providing a ``dict`` data, ``columns`` that don't overlap with the keys in ``data``, and an integer ``dtype`` returning a DataFrame with floating-point values (:issue:`24386`)
 
 Strings
 ^^^^^^^
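A spelled-out instance of the whatsnew entry above (GH 24386), mirroring the test added in this commit; treat it as an illustration rather than committed code.

    import numpy as np
    import pandas as pd

    # The dict key "A" does not overlap with the requested columns, so column
    # "B" is created empty; with this fix the explicit integer dtype is
    # honoured rather than being upcast to float.
    df = pd.DataFrame({"A": [1, 2]}, columns=["B"], dtype=np.int64)
    print(df.dtypes["B"])  # int64 (float64 before the fix)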

pandas/core/internals/construction.py (+91 -31)

@@ -21,7 +21,8 @@
 from pandas.core.dtypes.common import (
     is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal,
     is_extension_array_dtype, is_extension_type, is_float_dtype,
-    is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype)
+    is_integer_dtype, is_iterator, is_list_like, is_object_dtype,
+    is_string_dtype, pandas_dtype)
 from pandas.core.dtypes.generic import (
     ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries,
     ABCTimedeltaIndex)
@@ -173,49 +174,108 @@ def init_dict(data, index, columns, dtype=None):
     """
     from pandas.core.series import Series
 
+    # Converting a dict of arrays to a list of arrays sounds easy enough,
+    # right? Well, it's a bit more nuanced than that. Some problems:
+    # 1. Pandas allows missing values in the keys. If a user provides a dict
+    #    where the keys never compare equal (np.nan, pd.NaT, float('nan')),
+    #    we can't ever do a `data[key]`. So we *have* to iterate over the
+    #    key, value pairs of `data`, no way around it.
+    # 2. The key, value pairs of `data` may have
+    #    1. A subset of the desired columns
+    #    2. A superset of the columns
+    #    3. Just the right columns
+    #    And may or may not be in the right order (or ordered, period).
+    #    So we need to get a mapping from `key in data -> position`.
+    # 3. Inconsistencies between the Series and DataFrame constructors
+    #    w.r.t. dtypes make for a lot of special casing later on.
     if columns is None:
         columns = list(data)
 
     if not isinstance(columns, Index):
         columns = Index(columns, copy=False)
 
-    if columns.nlevels > 1:
-        # MultiIndex.__iter__ may be incorrect for integer levels
-        # with some missing values. The integer values are cast to
-        # float. The upshot is that we can't look up keys from the
-        # dict below.
-        column_iter = (columns[i] for i in range(len(columns)))
+    if data:
+        normalized_keys = Index(data, copy=False)
+        positions = Series(columns.get_indexer_for(normalized_keys),
+                           index=normalized_keys)
     else:
-        column_iter = iter(columns)
-
-    new_data = type(data)()  # dict or OrderedDict
-    sentinel = object()
-
-    for key in column_iter:
-        # We use an object() sentinel for two reasons:
-        # 1. We avoid having to allocate the Series in each iteration
-        # 2. We can use data.get(key, None), since the user is allowed
-        #    to pass DataFrame({"A": None}, index=[...]), which is
-        #    different from DataFrame({"A": Series(None, index=[...])})
-        #    which is probably a bug.
-        val = data.get(key, sentinel)
-
-        if val is sentinel:
-            val = Series(index=index, dtype=dtype)
-        elif val is None:
-            val = Series([None] * len(index), index=index,
-                         dtype=dtype or object)
+        positions = Series()
+
+    new_data = {}
+    index_len = 0 if index is None else len(index)
+
+    for key, val in data.items():
+        position = positions[key]
+        if position < 0:
+            # Something like data={"A": [...]}, columns={"B"}
+            continue
         if (isinstance(val, ABCDatetimeIndex) and
                 data[key].tz is not None):
             # GH#24096 need copy to be deep for datetime64tz case
             # TODO: See if we can avoid these copies
-            val = val[key].copy(deep=True)
+            val = val.copy(deep=True)
 
-        new_data[key] = val
-
-    keys = com.dict_keys_to_ordered_list(new_data)
+        elif val is None:
+            # Users may provide scalars as values. These are broadcast to
+            # the correct shape to align with `index`. We would use the
+            # Series constructor, but Series(None, index=index) is converted
+            # to NaNs. In DataFrame,
+            #     DataFrame({"A": None}, index=[1, 2], columns=["A"])
+            # is an array of Nones.
+            val = Series([None] * index_len, index=index,
+                         dtype=dtype or object)
 
-    arrays = [new_data[k] for k in keys]
+        elif index_len and lib.is_scalar(val):
+            val = Series(val, index=index, dtype=dtype)
+
+        new_data[position] = val
+
+    # OK, so the user-provided columns in `data` are taken care of. Let's
+    # move on to the "extra" columns as defined by `columns`. First, we
+    # figure out the positions of the holes we're filling in.
+    extra_positions = np.arange(len(columns))
+    mask = np.isin(extra_positions, positions, invert=True)
+    extra_positions = extra_positions[mask]
+
+    # And now, what should the dtype of these new guys be? Well, that's a
+    # little tricky.
+    # 1. User provided a dtype: just use that...
+    #    unless the user provided dtype=int and an index (GH-24385).
+    # 2. Empty data.keys() & columns is object (unless specified by the user).
+    # 3. No data and no dtype is object (unless specified by the user).
+
+    # https://github.com/pandas-dev/pandas/issues/24385
+    # Series(None, dtype=int) and DataFrame(None, dtype=dtype)
+    # differ when the index is provided.
+    # But if dtype is not provided, then we fall back to object.
+    # We have to pass this dtype through to arrays_to_mgr.
+
+    # Some things I'd like to change.
+    # With DataFrame(None, index=[1], columns=['a'], dtype=dtype):
+    #     for dtype=object, the result is object,
+    #     but for dtype=int, the result is float.
+    empty_columns = len(positions.index & columns) == 0
+
+    if empty_columns and dtype is None:
+        dtype = object
+    elif (index_len
+          and is_integer_dtype(dtype)):
+        # That's one complicated condition:
+        # DataFrame(None, index=idx, columns=cols, dtype=int) must be float
+        # DataFrame(None, index=idx, columns=cols, dtype=object) is object
+        # DataFrame({'a': 2}, columns=['b']) is object (empty)
+        dtype = float
+    elif not data and dtype is None:
+        dtype = np.dtype('object')
+
+    for position in extra_positions:
+        new_data[position] = Series(index=index, dtype=dtype)
+
+    arrays = [new_data[i] for i in range(len(columns))]
+
+    # hrm, this probably belongs in arrays_to_mgr...
+    if is_string_dtype(dtype) and not is_categorical_dtype(dtype):
+        dtype = np.dtype("object")
 
     return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
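A short sketch of the key-to-position mapping the new `init_dict` relies on. It is simplified: the real code passes the dict straight to `Index` and special-cases empty `data`; the variable names below are illustrative.

    from pandas import Index, Series

    # Build an Index from the dict's keys and ask the target columns where
    # each key lands; -1 marks keys that are not among the requested columns,
    # and any column position never hit is a "hole" later filled with an
    # empty Series of the resolved dtype.
    data = {"A": [1, 2], "C": [3, 4]}
    columns = Index(["A", "B"])
    keys = Index(list(data))
    positions = Series(columns.get_indexer_for(keys), index=keys)
    print(positions)
    # A    0    ("A" maps to column position 0)
    # C   -1    ("C" is not in columns, so it is skipped)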

pandas/tests/frame/test_constructors.py (+12 -0)

@@ -816,6 +816,18 @@ def test_constructor_dtype(self, data, index, columns, dtype, expected):
         df = DataFrame(data, index, columns, dtype)
         assert df.values.dtype == expected
 
+    @pytest.mark.parametrize('dtype', [
+        np.dtype("int64"),
+        np.dtype("float32"),
+        np.dtype("object"),
+        np.dtype("datetime64[ns]"),
+        "category"
+    ])
+    def test_constructor_dtype_non_overlapping_columns(self, dtype):
+        df = DataFrame({"A": [1, 2]}, columns=['B'], dtype=dtype)
+        result = df.dtypes['B']
+        assert result == dtype
+
     def test_constructor_scalar_inference(self):
         data = {'int': 1, 'bool': True,
                 'float': 3., 'complex': 4j, 'object': 'foo'}
