Skip to content

BUG: DataFrame fail to construct when data is list and columns is nested list for MI #32202

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 44 commits into from
Apr 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
7e461a1
remove \n from docstring
charlesdong1991 Dec 3, 2018
1314059
fix conflicts
charlesdong1991 Jan 19, 2019
8bcb313
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jul 30, 2019
24c3ede
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jan 14, 2020
dea38f2
fix issue 17038
charlesdong1991 Jan 14, 2020
cd9e7ac
revert change
charlesdong1991 Jan 14, 2020
e5e912b
revert change
charlesdong1991 Jan 14, 2020
e609188
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Feb 23, 2020
c8ee822
fix 32173
charlesdong1991 Feb 23, 2020
07ffde2
linting
charlesdong1991 Feb 23, 2020
b3f3da0
linting
charlesdong1991 Feb 23, 2020
2f2054c
add whatsnew
charlesdong1991 Feb 23, 2020
9176389
fix linting
charlesdong1991 Feb 23, 2020
ed02384
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Feb 23, 2020
d3b0c50
rebase and resolve conflict
charlesdong1991 Mar 4, 2020
a5e0d10
separate out column validation
charlesdong1991 Mar 4, 2020
6073ed7
code change based on JR review
charlesdong1991 Mar 4, 2020
e8f6d67
fixup
charlesdong1991 Mar 4, 2020
559b5d6
fixup
charlesdong1991 Mar 4, 2020
a452816
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Mar 5, 2020
3fd6743
rebase and fix conflict
charlesdong1991 Mar 15, 2020
2428edb
add docs and annotation
charlesdong1991 Mar 15, 2020
fe18e50
black
charlesdong1991 Mar 15, 2020
a5d159b
Add more docs
charlesdong1991 Mar 15, 2020
7516964
add annotation
charlesdong1991 Mar 15, 2020
86bd699
add for dict
charlesdong1991 Mar 15, 2020
30a70a7
remove unused import
charlesdong1991 Mar 15, 2020
a6cb139
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 2, 2020
ed6dc4a
improve annotation
charlesdong1991 Apr 2, 2020
f058d2c
fix annotation
charlesdong1991 Apr 2, 2020
2852579
fix annotation
charlesdong1991 Apr 2, 2020
493ac33
fixup
charlesdong1991 Apr 2, 2020
3ecd6b8
more details
charlesdong1991 Apr 3, 2020
851a3e1
isort
charlesdong1991 Apr 3, 2020
9860985
better annotation
charlesdong1991 Apr 3, 2020
ffc6561
removed unused import
charlesdong1991 Apr 3, 2020
a028c33
linting
charlesdong1991 Apr 3, 2020
5af0f8e
code change on JB review
charlesdong1991 Apr 3, 2020
9eda16a
fixup
charlesdong1991 Apr 3, 2020
35e92ea
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 3, 2020
44af738
fix conflicts
charlesdong1991 Apr 4, 2020
d0a10e4
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 5, 2020
1700e5d
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 6, 2020
8a036e4
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 6, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ Other
instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
- Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`)
- Bug in :class:`DataFrame` when initializing a frame from a list and assigning ``columns`` a nested list for a ``MultiIndex`` (:issue:`32173`)
- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`)
- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`)
Expand Down
114 changes: 96 additions & 18 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
constructors before passing them to a BlockManager.
"""
from collections import abc
from typing import Tuple
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import numpy.ma as ma

from pandas._libs import lib
from pandas._typing import Axis, Dtype, Scalar

from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
Expand Down Expand Up @@ -522,29 +523,38 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)


def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_to_arrays(
    data: List[Scalar],
    columns: Union[Index, List],
    coerce_float: bool = False,
    dtype: Optional[Dtype] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
    """
    Convert a list of row records (tuples, or lists of values) into a list
    of column arrays plus the validated/derived columns.

    Parameters
    ----------
    data : list of row records (tuples or list-likes)
    columns : Index, list, or None
        Column labels; validated (or generated positionally) by
        ``_validate_or_indexify_columns``.
    coerce_float : bool, default False
    dtype : np.dtype, optional

    Returns
    -------
    (arrays, columns) tuple where ``arrays`` holds one array per column.
    """
    # Tuples get the dedicated cython path; anything else is treated as
    # a list of lists.
    if len(data) > 0 and isinstance(data[0], tuple):
        rows = lib.to_object_array_tuples(data)
    else:
        # list of lists
        rows = lib.to_object_array(data)
    content = list(rows.T)

    # gh-26429 do not raise user-facing AssertionError
    try:
        columns = _validate_or_indexify_columns(content, columns)
        result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
    except AssertionError as err:
        raise ValueError(err) from err
    return result, columns


def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_of_series_to_arrays(
data: List,
columns: Union[Index, List],
coerce_float: bool = False,
dtype: Optional[Dtype] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
if columns is None:
# We know pass_data is non-empty because data[0] is a Series
pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
columns = get_objs_combined_axis(pass_data, sort=False)

indexer_cache = {}
indexer_cache: Dict[int, Scalar] = {}

aligned_values = []
for s in data:
Expand All @@ -564,14 +574,19 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):

if values.dtype == np.object_:
content = list(values.T)
return _convert_object_array(
content, columns, dtype=dtype, coerce_float=coerce_float
)
columns = _validate_or_indexify_columns(content, columns)
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
return content, columns
else:
return values.T, columns


def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_of_dict_to_arrays(
data: List,
columns: Union[Index, List],
coerce_float: bool = False,
dtype: Optional[Dtype] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
"""
Convert list of dicts to numpy arrays

Expand Down Expand Up @@ -603,22 +618,85 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
data = [(type(d) is dict) and d or dict(d) for d in data]

content = list(lib.dicts_to_array(data, list(columns)).T)
return _convert_object_array(
content, columns, dtype=dtype, coerce_float=coerce_float
)
columns = _validate_or_indexify_columns(content, columns)
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
return content, columns


def _convert_object_array(content, columns, coerce_float=False, dtype=None):
def _validate_or_indexify_columns(
content: List, columns: Union[Index, List, None]
) -> Union[Index, List[Axis]]:
"""
If columns is None, make numbers as column names; Otherwise, validate that
columns have valid length.

Parameters
----------
content: list of data
columns: Iterable or None

Returns
-------
columns: If columns is Iterable, return as is; If columns is None, assign
positional column index value as columns.

Raises
------
1. AssertionError when content is not composed of list of lists, and if
length of columns is not equal to length of content.
2. ValueError when content is list of lists, but length of each sub-list
is not equal
3. ValueError when content is list of lists, but length of sub-list is
not equal to length of content
"""
if columns is None:
columns = ibase.default_index(len(content))
else:
if len(columns) != len(content): # pragma: no cover

# Add mask for data which is composed of list of lists
is_mi_list = isinstance(columns, list) and all(
isinstance(col, list) for col in columns
)

if not is_mi_list and len(columns) != len(content): # pragma: no cover
# caller's responsibility to check for this...
raise AssertionError(
f"{len(columns)} columns passed, passed data had "
f"{len(content)} columns"
)
elif is_mi_list:

# check if nested list column, length of each sub-list should be equal
if len({len(col) for col in columns}) > 1:
raise ValueError(
"Length of columns passed for MultiIndex columns is different"
)

# if columns is not empty and length of sublist is not equal to content
elif columns and len(columns[0]) != len(content):
raise ValueError(
f"{len(columns[0])} columns passed, passed data had "
f"{len(content)} columns"
)
return columns


def _convert_object_array(
content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None
) -> List[Scalar]:
"""
Internal function ot convert object array.

Parameters
----------
content: list of processed data records
coerce_float: bool, to coerce floats or not, default is False
dtype: np.dtype, default is None

Returns
-------
arrays: casted content if not object dtype, otherwise return as is in list.
"""
# provide soft conversion of object dtypes
def convert(arr):
if dtype != object and dtype != np.object:
Expand All @@ -628,7 +706,7 @@ def convert(arr):

arrays = [convert(arr) for arr in content]

return arrays, columns
return arrays


# ---------------------------------------------------------------------
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,32 @@ def test_constructor_list_of_lists(self):
result = DataFrame(data)
tm.assert_frame_equal(result, expected)

def test_constructor_list_like_data_nested_list_column(self):
# GH 32173
arrays = [list("abcd"), list("cdef")]
result = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)

mi = MultiIndex.from_arrays(arrays)
expected = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi)

tm.assert_frame_equal(result, expected)

def test_constructor_wrong_length_nested_list_column(self):
# GH 32173
arrays = [list("abc"), list("cde")]

msg = "3 columns passed, passed data had 4"
with pytest.raises(ValueError, match=msg):
DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)

def test_constructor_unequal_length_nested_list_column(self):
# GH 32173
arrays = [list("abcd"), list("cde")]

msg = "Length of columns passed for MultiIndex columns is different"
with pytest.raises(ValueError, match=msg):
DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)

def test_constructor_sequence_like(self):
# GH 3783
        # collections.Sequence like
Expand Down