Skip to content

BUG: DataFrame(ndarray, dtype=categoricaldtype) #38857

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,8 @@ Categorical
^^^^^^^^^^^
- Bug in :class:`CategoricalIndex` incorrectly failing to raise ``TypeError`` when scalar data is passed (:issue:`38614`)
- Bug in ``CategoricalIndex.reindex`` failed when ``Index`` passed with elements all in category (:issue:`28690`)
- Bug where construcing a :class:`Categorical` from an object-dtype array of ``date`` objects did not round-trip correctly with ``astype`` (:issue:`38552`)

- Bug where constructing a :class:`Categorical` from an object-dtype array of ``date`` objects did not round-trip correctly with ``astype`` (:issue:`38552`)
- Bug in constructing a :class:`DataFrame` from an ``ndarray`` and a :class:`CategoricalDtype` (:issue:`38857`)

Datetimelike
^^^^^^^^^^^^
Expand Down
22 changes: 5 additions & 17 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
maybe_upcast,
)
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_datetime64tz_dtype,
is_dtype_equal,
is_extension_array_dtype,
Expand Down Expand Up @@ -160,21 +159,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
if not len(values) and columns is not None and len(columns):
values = np.empty((0, 1), dtype=object)

# we could have a categorical type passed or coerced to 'category'
# recast this to an arrays_to_mgr
if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype(
dtype
):

if not hasattr(values, "dtype"):
values = _prep_ndarray(values, copy=copy)
values = values.ravel()
elif copy:
values = values.copy()

index, columns = _get_axes(len(values), 1, index, columns)
return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
if is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
# GH#19157

if isinstance(values, np.ndarray) and values.ndim > 1:
Expand Down Expand Up @@ -308,6 +293,7 @@ def nested_data_to_arrays(
if isinstance(data[0], ABCSeries):
index = _get_names_from_index(data)
elif isinstance(data[0], Categorical):
# GH#38845 hit in test_constructor_categorical
index = ibase.default_index(len(data[0]))
else:
index = ibase.default_index(len(data))
Expand Down Expand Up @@ -486,7 +472,9 @@ def _get_names_from_index(data):
return index


def _get_axes(N, K, index, columns) -> Tuple[Index, Index]:
def _get_axes(
N: int, K: int, index: Optional[Index], columns: Optional[Index]
) -> Tuple[Index, Index]:
# helper to create the axes as indexes
# return axes or defaults

Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1890,6 +1890,16 @@ def test_constructor_lists_to_object_dtype(self):
assert d["a"].dtype == np.object_
assert not d["a"][1]

def test_constructor_ndarray_categorical_dtype(self):
cat = Categorical(["A", "B", "C"])
arr = np.array(cat).reshape(-1, 1)
arr = np.broadcast_to(arr, (3, 4))

result = DataFrame(arr, dtype=cat.dtype)

expected = DataFrame({0: cat, 1: cat, 2: cat, 3: cat})
tm.assert_frame_equal(result, expected)

def test_constructor_categorical(self):

# GH8626
Expand All @@ -1913,11 +1923,13 @@ def test_constructor_categorical(self):
expected = Series(list("abc"), dtype="category", name=0)
tm.assert_series_equal(df[0], expected)

def test_construct_from_1item_list_of_categorical(self):
# ndim != 1
df = DataFrame([Categorical(list("abc"))])
expected = DataFrame({0: Series(list("abc"), dtype="category")})
tm.assert_frame_equal(df, expected)

def test_construct_from_list_of_categoricals(self):
df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))])
expected = DataFrame(
{
Expand All @@ -1928,18 +1940,22 @@ def test_constructor_categorical(self):
)
tm.assert_frame_equal(df, expected)

def test_from_nested_listlike_mixed_types(self):
# mixed
df = DataFrame([Categorical(list("abc")), list("def")])
expected = DataFrame(
{0: Series(list("abc"), dtype="category"), 1: list("def")}, columns=[0, 1]
)
tm.assert_frame_equal(df, expected)

def test_construct_from_listlikes_mismatched_lengths(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may pay to split out this test file, as it is getting kind of big.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've been thinking that myself. Will address before long.

# invalid (shape)
msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)"
with pytest.raises(ValueError, match=msg):
DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))])

def test_categorical_1d_only(self):
# TODO: belongs in Categorical tests
# ndim > 1
msg = "> 1 ndim Categorical are not supported at this time"
with pytest.raises(NotImplementedError, match=msg):
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,16 @@ def test_constructor_categorical(self):
expected = Series([1, 2, 3], dtype="int64")
tm.assert_series_equal(result, expected)

def test_construct_from_categorical_with_dtype(self):
# GH12574
cat = Series(Categorical([1, 2, 3]), dtype="category")
assert is_categorical_dtype(cat)
assert is_categorical_dtype(cat.dtype)
s = Series([1, 2, 3], dtype="category")
assert is_categorical_dtype(s)
assert is_categorical_dtype(s.dtype)

def test_construct_intlist_values_category_dtype(self):
ser = Series([1, 2, 3], dtype="category")
assert is_categorical_dtype(ser)
assert is_categorical_dtype(ser.dtype)

def test_constructor_categorical_with_coercion(self):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
Expand Down