Skip to content

BUG: DataFrame fail to construct when data is list and columns is nested list for MI #32202

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 44 commits into from
Apr 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
7e461a1
remove \n from docstring
charlesdong1991 Dec 3, 2018
1314059
fix conflicts
charlesdong1991 Jan 19, 2019
8bcb313
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jul 30, 2019
24c3ede
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jan 14, 2020
dea38f2
fix issue 17038
charlesdong1991 Jan 14, 2020
cd9e7ac
revert change
charlesdong1991 Jan 14, 2020
e5e912b
revert change
charlesdong1991 Jan 14, 2020
e609188
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Feb 23, 2020
c8ee822
fix 32173
charlesdong1991 Feb 23, 2020
07ffde2
linting
charlesdong1991 Feb 23, 2020
b3f3da0
linting
charlesdong1991 Feb 23, 2020
2f2054c
add whatsnew
charlesdong1991 Feb 23, 2020
9176389
fix linting
charlesdong1991 Feb 23, 2020
ed02384
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Feb 23, 2020
d3b0c50
rebase and resolve conflict
charlesdong1991 Mar 4, 2020
a5e0d10
separate out column validation
charlesdong1991 Mar 4, 2020
6073ed7
code change based on JR review
charlesdong1991 Mar 4, 2020
e8f6d67
fixup
charlesdong1991 Mar 4, 2020
559b5d6
fixup
charlesdong1991 Mar 4, 2020
a452816
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Mar 5, 2020
3fd6743
rebase and fix conflict
charlesdong1991 Mar 15, 2020
2428edb
add docs and annotation
charlesdong1991 Mar 15, 2020
fe18e50
black
charlesdong1991 Mar 15, 2020
a5d159b
Add more docs
charlesdong1991 Mar 15, 2020
7516964
add annotation
charlesdong1991 Mar 15, 2020
86bd699
add for dict
charlesdong1991 Mar 15, 2020
30a70a7
remove unused import
charlesdong1991 Mar 15, 2020
a6cb139
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 2, 2020
ed6dc4a
improve annotation
charlesdong1991 Apr 2, 2020
f058d2c
fix annotation
charlesdong1991 Apr 2, 2020
2852579
fix annotation
charlesdong1991 Apr 2, 2020
493ac33
fixup
charlesdong1991 Apr 2, 2020
3ecd6b8
more details
charlesdong1991 Apr 3, 2020
851a3e1
isort
charlesdong1991 Apr 3, 2020
9860985
better annotation
charlesdong1991 Apr 3, 2020
ffc6561
removed unused import
charlesdong1991 Apr 3, 2020
a028c33
linting
charlesdong1991 Apr 3, 2020
5af0f8e
code change on JB review
charlesdong1991 Apr 3, 2020
9eda16a
fixup
charlesdong1991 Apr 3, 2020
35e92ea
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 3, 2020
44af738
fix conflicts
charlesdong1991 Apr 4, 2020
d0a10e4
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 5, 2020
1700e5d
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 6, 2020
8a036e4
Merge remote-tracking branch 'upstream/master' into fix_issue_32173
charlesdong1991 Apr 6, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ Other
instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
- Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`)
- Bug in :class:`DataFrame` when initializing a frame from a list and assigning ``columns`` a nested list for a ``MultiIndex`` (:issue:`32173`)
- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`)
- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`)
Expand Down
114 changes: 96 additions & 18 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
constructors before passing them to a BlockManager.
"""
from collections import abc
from typing import Tuple
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import numpy.ma as ma

from pandas._libs import lib
from pandas._typing import Axis, Dtype, Scalar

from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
Expand Down Expand Up @@ -522,29 +523,38 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)


def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_to_arrays(
    data: List[Scalar],
    columns: Union[Index, List],
    coerce_float: bool = False,
    dtype: Optional[Dtype] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
    """
    Convert a list of row records (tuples, or lists of values) into a list
    of column arrays plus the validated/derived columns.

    Parameters
    ----------
    data : list of row records (tuples or list-likes)
    columns : Index, list, or None
        Column labels; validated (or generated positionally) by
        ``_validate_or_indexify_columns``.
    coerce_float : bool, default False
    dtype : np.dtype, optional

    Returns
    -------
    (arrays, columns) tuple where ``arrays`` holds one array per column.
    """
    # Tuples get the dedicated cython path; anything else is treated as
    # a list of lists.
    if len(data) > 0 and isinstance(data[0], tuple):
        rows = lib.to_object_array_tuples(data)
    else:
        # list of lists
        rows = lib.to_object_array(data)
    content = list(rows.T)

    # gh-26429 do not raise user-facing AssertionError
    try:
        columns = _validate_or_indexify_columns(content, columns)
        result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
    except AssertionError as err:
        raise ValueError(err) from err
    return result, columns


def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_of_series_to_arrays(
data: List,
columns: Union[Index, List],
coerce_float: bool = False,
dtype: Optional[Dtype] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
if columns is None:
# We know pass_data is non-empty because data[0] is a Series
pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
columns = get_objs_combined_axis(pass_data, sort=False)

indexer_cache = {}
indexer_cache: Dict[int, Scalar] = {}

aligned_values = []
for s in data:
Expand All @@ -564,14 +574,19 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):

if values.dtype == np.object_:
content = list(values.T)
return _convert_object_array(
content, columns, dtype=dtype, coerce_float=coerce_float
)
columns = _validate_or_indexify_columns(content, columns)
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
return content, columns
else:
return values.T, columns


def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
def _list_of_dict_to_arrays(
data: List,
columns: Union[Index, List],
coerce_float: bool = False,
dtype: Optional[Dtype] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
"""
Convert list of dicts to numpy arrays

Expand Down Expand Up @@ -603,22 +618,85 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
data = [(type(d) is dict) and d or dict(d) for d in data]

content = list(lib.dicts_to_array(data, list(columns)).T)
return _convert_object_array(
content, columns, dtype=dtype, coerce_float=coerce_float
)
columns = _validate_or_indexify_columns(content, columns)
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
return content, columns


def _convert_object_array(content, columns, coerce_float=False, dtype=None):
def _validate_or_indexify_columns(
content: List, columns: Union[Index, List, None]
) -> Union[Index, List[Axis]]:
"""
If columns is None, make numbers as column names; Otherwise, validate that
columns have valid length.

Parameters
----------
content: list of data
columns: Iterable or None

Returns
-------
columns: If columns is Iterable, return as is; If columns is None, assign
positional column index value as columns.

Raises
------
1. AssertionError when content is not composed of list of lists, and if
length of columns is not equal to length of content.
2. ValueError when content is list of lists, but length of each sub-list
is not equal
3. ValueError when content is list of lists, but length of sub-list is
not equal to length of content
"""
if columns is None:
columns = ibase.default_index(len(content))
else:
if len(columns) != len(content): # pragma: no cover

# Add mask for data which is composed of list of lists
is_mi_list = isinstance(columns, list) and all(
isinstance(col, list) for col in columns
)

if not is_mi_list and len(columns) != len(content): # pragma: no cover
# caller's responsibility to check for this...
raise AssertionError(
f"{len(columns)} columns passed, passed data had "
f"{len(content)} columns"
)
elif is_mi_list:

# check if nested list column, length of each sub-list should be equal
if len({len(col) for col in columns}) > 1:
raise ValueError(
"Length of columns passed for MultiIndex columns is different"
)

# if columns is not empty and length of sublist is not equal to content
elif columns and len(columns[0]) != len(content):
raise ValueError(
f"{len(columns[0])} columns passed, passed data had "
f"{len(content)} columns"
)
return columns


def _convert_object_array(
content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None
) -> List[Scalar]:
"""
Internal function ot convert object array.

Parameters
----------
content: list of processed data records
coerce_float: bool, to coerce floats or not, default is False
dtype: np.dtype, default is None

Returns
-------
arrays: casted content if not object dtype, otherwise return as is in list.
"""
# provide soft conversion of object dtypes
def convert(arr):
if dtype != object and dtype != np.object:
Expand All @@ -628,7 +706,7 @@ def convert(arr):

arrays = [convert(arr) for arr in content]

return arrays, columns
return arrays


# ---------------------------------------------------------------------
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,32 @@ def test_constructor_list_of_lists(self):
result = DataFrame(data)
tm.assert_frame_equal(result, expected)

def test_constructor_list_like_data_nested_list_column(self):
# GH 32173
arrays = [list("abcd"), list("cdef")]
result = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)

mi = MultiIndex.from_arrays(arrays)
expected = pd.DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi)

tm.assert_frame_equal(result, expected)

def test_constructor_wrong_length_nested_list_column(self):
# GH 32173
arrays = [list("abc"), list("cde")]

msg = "3 columns passed, passed data had 4"
with pytest.raises(ValueError, match=msg):
DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)

def test_constructor_unequal_length_nested_list_column(self):
# GH 32173
arrays = [list("abcd"), list("cde")]

msg = "Length of columns passed for MultiIndex columns is different"
with pytest.raises(ValueError, match=msg):
DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)

def test_constructor_sequence_like(self):
# GH 3783
        # collections.Sequence like
Expand Down