Skip to content

Commit 2dd80c8

Browse files
BUG: DataFrame fail to construct when data is list and columns is nested list for MI (#32202)
1 parent 522461f commit 2dd80c8

File tree

3 files changed

+123
-18
lines changed

3 files changed

+123
-18
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,7 @@ Other
471471
instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
472472
- Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
473473
- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`)
474+
- Bug in :class:`DataFrame` when constructing a frame from lists and passing a nested list as ``columns`` to create a ``MultiIndex`` (:issue:`32173`)
474475
- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`)
475476
- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
476477
- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`)

pandas/core/internals/construction.py

+96-18
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33
constructors before passing them to a BlockManager.
44
"""
55
from collections import abc
6-
from typing import Tuple
6+
from typing import Dict, List, Optional, Tuple, Union
77

88
import numpy as np
99
import numpy.ma as ma
1010

1111
from pandas._libs import lib
12+
from pandas._typing import Axis, Dtype, Scalar
1213

1314
from pandas.core.dtypes.cast import (
1415
construct_1d_arraylike_from_scalar,
@@ -522,29 +523,38 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
522523
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
523524

524525

525-
def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
526+
def _list_to_arrays(
527+
data: List[Scalar],
528+
columns: Union[Index, List],
529+
coerce_float: bool = False,
530+
dtype: Optional[Dtype] = None,
531+
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
526532
if len(data) > 0 and isinstance(data[0], tuple):
527533
content = list(lib.to_object_array_tuples(data).T)
528534
else:
529535
# list of lists
530536
content = list(lib.to_object_array(data).T)
531537
# gh-26429 do not raise user-facing AssertionError
532538
try:
533-
result = _convert_object_array(
534-
content, columns, dtype=dtype, coerce_float=coerce_float
535-
)
539+
columns = _validate_or_indexify_columns(content, columns)
540+
result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
536541
except AssertionError as e:
537542
raise ValueError(e) from e
538-
return result
543+
return result, columns
539544

540545

541-
def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
546+
def _list_of_series_to_arrays(
547+
data: List,
548+
columns: Union[Index, List],
549+
coerce_float: bool = False,
550+
dtype: Optional[Dtype] = None,
551+
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
542552
if columns is None:
543553
# We know pass_data is non-empty because data[0] is a Series
544554
pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
545555
columns = get_objs_combined_axis(pass_data, sort=False)
546556

547-
indexer_cache = {}
557+
indexer_cache: Dict[int, Scalar] = {}
548558

549559
aligned_values = []
550560
for s in data:
@@ -564,14 +574,19 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
564574

565575
if values.dtype == np.object_:
566576
content = list(values.T)
567-
return _convert_object_array(
568-
content, columns, dtype=dtype, coerce_float=coerce_float
569-
)
577+
columns = _validate_or_indexify_columns(content, columns)
578+
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
579+
return content, columns
570580
else:
571581
return values.T, columns
572582

573583

574-
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
584+
def _list_of_dict_to_arrays(
585+
data: List,
586+
columns: Union[Index, List],
587+
coerce_float: bool = False,
588+
dtype: Optional[Dtype] = None,
589+
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
575590
"""
576591
Convert list of dicts to numpy arrays
577592
@@ -603,22 +618,85 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
603618
data = [(type(d) is dict) and d or dict(d) for d in data]
604619

605620
content = list(lib.dicts_to_array(data, list(columns)).T)
606-
return _convert_object_array(
607-
content, columns, dtype=dtype, coerce_float=coerce_float
608-
)
621+
columns = _validate_or_indexify_columns(content, columns)
622+
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
623+
return content, columns
609624

610625

611-
def _convert_object_array(content, columns, coerce_float=False, dtype=None):
626+
def _validate_or_indexify_columns(
627+
content: List, columns: Union[Index, List, None]
628+
) -> Union[Index, List[Axis]]:
629+
"""
630+
If columns is None, make numbers as column names; Otherwise, validate that
631+
columns have valid length.
632+
633+
Parameters
634+
----------
635+
content: list of data
636+
columns: Iterable or None
637+
638+
Returns
639+
-------
640+
columns: If columns is Iterable, return as is; If columns is None, assign
641+
positional column index value as columns.
642+
643+
Raises
644+
------
645+
1. AssertionError when columns is not composed of list of lists, and if
646+
length of columns is not equal to length of content.
647+
2. ValueError when columns is list of lists, but length of each sub-list
648+
is not equal
649+
3. ValueError when columns is list of lists, but length of sub-list is
650+
not equal to length of content
651+
"""
612652
if columns is None:
613653
columns = ibase.default_index(len(content))
614654
else:
615-
if len(columns) != len(content): # pragma: no cover
655+
656+
# Add mask for data which is composed of list of lists
657+
is_mi_list = isinstance(columns, list) and all(
658+
isinstance(col, list) for col in columns
659+
)
660+
661+
if not is_mi_list and len(columns) != len(content): # pragma: no cover
616662
# caller's responsibility to check for this...
617663
raise AssertionError(
618664
f"{len(columns)} columns passed, passed data had "
619665
f"{len(content)} columns"
620666
)
667+
elif is_mi_list:
668+
669+
# check if nested list column, length of each sub-list should be equal
670+
if len({len(col) for col in columns}) > 1:
671+
raise ValueError(
672+
"Length of columns passed for MultiIndex columns is different"
673+
)
674+
675+
# if columns is not empty and length of sublist is not equal to content
676+
elif columns and len(columns[0]) != len(content):
677+
raise ValueError(
678+
f"{len(columns[0])} columns passed, passed data had "
679+
f"{len(content)} columns"
680+
)
681+
return columns
682+
683+
684+
def _convert_object_array(
685+
content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None
686+
) -> List[Scalar]:
687+
"""
688+
Internal function to convert object array.
689+
690+
Parameters
691+
----------
692+
content: list of processed data records
693+
coerce_float: bool, to coerce floats or not, default is False
694+
dtype: np.dtype, default is None
621695
696+
Returns
697+
-------
698+
arrays: content cast to the inferred dtype if not object dtype, otherwise returned as is in a list.
699+
"""
622700
# provide soft conversion of object dtypes
623701
def convert(arr):
624702
if dtype != object and dtype != np.object:
@@ -628,7 +706,7 @@ def convert(arr):
628706

629707
arrays = [convert(arr) for arr in content]
630708

631-
return arrays, columns
709+
return arrays
632710

633711

634712
# ---------------------------------------------------------------------

pandas/tests/frame/test_constructors.py

+26
Original file line numberDiff line numberDiff line change
@@ -1063,6 +1063,32 @@ def test_constructor_list_of_lists(self):
10631063
result = DataFrame(data)
10641064
tm.assert_frame_equal(result, expected)
10651065

1066+
def test_constructor_list_like_data_nested_list_column(self):
1067+
# GH 32173
1068+
arrays = [list("abcd"), list("cdef")]
1069+
result = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
1070+
1071+
mi = MultiIndex.from_arrays(arrays)
1072+
expected = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi)
1073+
1074+
tm.assert_frame_equal(result, expected)
1075+
1076+
def test_constructor_wrong_length_nested_list_column(self):
1077+
# GH 32173
1078+
arrays = [list("abc"), list("cde")]
1079+
1080+
msg = "3 columns passed, passed data had 4"
1081+
with pytest.raises(ValueError, match=msg):
1082+
DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
1083+
1084+
def test_constructor_unequal_length_nested_list_column(self):
1085+
# GH 32173
1086+
arrays = [list("abcd"), list("cde")]
1087+
1088+
msg = "Length of columns passed for MultiIndex columns is different"
1089+
with pytest.raises(ValueError, match=msg):
1090+
DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
1091+
10661092
def test_constructor_sequence_like(self):
10671093
# GH 3783
10681094
# collections.Sequence like

0 commit comments

Comments
 (0)