REF: implement nested_data_to_arrays #38757

Merged 1 commit on Dec 29, 2020

26 changes: 17 additions & 9 deletions pandas/core/arrays/categorical.py
@@ -2,7 +2,17 @@
from functools import partial
import operator
from shutil import get_terminal_size
from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast
from typing import (
TYPE_CHECKING,
Dict,
Hashable,
List,
Sequence,
Type,
TypeVar,
Union,
cast,
)
from warnings import warn

import numpy as np
@@ -58,6 +68,10 @@

from pandas.io.formats import console

if TYPE_CHECKING:
from pandas import Index


CategoricalT = TypeVar("CategoricalT", bound="Categorical")
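
The TYPE_CHECKING guard above makes Index available to static type checkers without importing it at runtime (which would create a circular import), and the annotation added further down is written as a string so it is never evaluated. A minimal standalone sketch of the pattern, using a hypothetical helper name, not part of the diff:

from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
    from pandas import Index  # seen by mypy/pyright only, never imported at runtime


def get_codes(values, categories: "Index") -> np.ndarray:
    # hypothetical stand-in mirroring the new _get_codes_for_values signature
    ...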


@@ -1708,13 +1722,7 @@ def fillna(self, value=None, method=None, limit=None):
mask = self.isna()

new_codes = self._validate_setitem_value(value)

if isinstance(value, (np.ndarray, Categorical)):
# We get ndarray or Categorical if called via Series.fillna,
# where it will unwrap another aligned Series before getting here
codes[mask] = new_codes[mask]
else:
codes[mask] = new_codes
np.putmask(codes, mask, new_codes)

return self._from_backing_data(codes)
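
The single np.putmask call can replace both of the removed branches because np.putmask accepts either a scalar or an array aligned with the target for its values argument. An illustrative snippet, not part of the diff:

import numpy as np

codes = np.array([0, -1, 2, -1])
mask = codes == -1

# scalar replacement: every masked position gets the same value
np.putmask(codes, mask, 1)                        # codes -> [0, 1, 2, 1]

# aligned-array replacement: masked positions take the value at the same index
codes = np.array([0, -1, 2, -1])
np.putmask(codes, mask, np.array([9, 8, 7, 6]))   # codes -> [0, 8, 2, 6]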

@@ -2510,7 +2518,7 @@ def _delegate_method(self, name, *args, **kwargs):
# utility routines


def _get_codes_for_values(values, categories) -> np.ndarray:
def _get_codes_for_values(values, categories: "Index") -> np.ndarray:
"""
utility routine to turn values into codes given the specified categories

27 changes: 8 additions & 19 deletions pandas/core/frame.py
@@ -112,7 +112,6 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_named_tuple,
is_object_dtype,
is_scalar,
is_sequence,
@@ -129,7 +128,7 @@
transform,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import extract_array, sanitize_masked_array
from pandas.core.generic import NDFrame, _shared_docs
@@ -147,13 +146,14 @@
from pandas.core.internals.construction import (
arrays_to_mgr,
dataclasses_to_dicts,
get_names_from_index,
init_dict,
init_ndarray,
masked_rec_array_to_mgr,
nested_data_to_arrays,
reorder_arrays,
sanitize_index,
to_arrays,
treat_as_nested,
)
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
@@ -565,27 +565,16 @@ def __init__(
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

# For data that is list-like, or an Iterable (will consume into list)
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
elif is_list_like(data):
if not isinstance(data, (abc.Sequence, ExtensionArray)):
data = list(data)
if len(data) > 0:
if is_dataclass(data[0]):
data = dataclasses_to_dicts(data)
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)

# set the index
if index is None:
if isinstance(data[0], Series):
index = get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = ibase.default_index(len(data[0]))
else:
index = ibase.default_index(len(data))

if treat_as_nested(data):
arrays, columns, index = nested_data_to_arrays(
data, columns, index, dtype
)
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
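
For reference, two constructions that go through this list-like branch; the user-visible behaviour is expected to be unchanged by the refactor (illustrative snippet, not part of the diff):

from collections import namedtuple

import pandas as pd

Point = namedtuple("Point", ["x", "y"])
pd.DataFrame([Point(1, 2), Point(3, 4)])   # columns inferred from the namedtuple fields: ['x', 'y']

s1 = pd.Series([1, 2], name="a")
s2 = pd.Series([3, 4], name="b")
pd.DataFrame([s1, s2])                     # index inferred from the Series names: ['a', 'b']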
40 changes: 38 additions & 2 deletions pandas/core/internals/construction.py
@@ -27,6 +27,7 @@
is_extension_array_dtype,
is_integer_dtype,
is_list_like,
is_named_tuple,
is_object_dtype,
)
from pandas.core.dtypes.generic import (
@@ -106,7 +107,7 @@ def masked_rec_array_to_mgr(
# essentially process a record array then fill it
fdata = ma.getdata(data)
if index is None:
index = get_names_from_index(fdata)
index = _get_names_from_index(fdata)
if index is None:
index = ibase.default_index(len(data))
index = ensure_index(index)
@@ -286,6 +287,41 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)


def nested_data_to_arrays(
data: Sequence,
columns: Optional[Index],
index: Optional[Index],
dtype: Optional[DtypeObj],
):
"""
Convert a single sequence of arrays to multiple arrays.
"""
# By the time we get here we have already checked treat_as_nested(data)

if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields

arrays, columns = to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)

if index is None:
if isinstance(data[0], ABCSeries):
index = _get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = ibase.default_index(len(data[0]))
else:
index = ibase.default_index(len(data))

return arrays, columns, index


def treat_as_nested(data) -> bool:
Review comment (Contributor): could type data as Sequence at a later point

"""
Check if we should use nested_data_to_arrays.
"""
return len(data) > 0 and is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1
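
A rough sketch of the dispatch these helpers enable; they live in pandas.core.internals.construction, which is internal API, so the import path and signatures may change:

from pandas.core.internals.construction import treat_as_nested

treat_as_nested([[1, 2], [3, 4]])   # True  -> DataFrame.__init__ uses nested_data_to_arrays
treat_as_nested([1, 2, 3])          # False -> falls through to init_ndarray
treat_as_nested([])                 # False -> empty data short-circuits on len(data) > 0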


# ---------------------------------------------------------------------


@@ -432,7 +468,7 @@ def reorder_arrays(arrays, arr_columns, columns):
return arrays, arr_columns


def get_names_from_index(data):
def _get_names_from_index(data):
has_some_name = any(getattr(s, "name", None) is not None for s in data)
if not has_some_name:
return ibase.default_index(len(data))
8 changes: 6 additions & 2 deletions pandas/tests/series/apply/test_series_apply.py
@@ -778,10 +778,14 @@ def test_map_missing_mixed(self, vals, mapping, exp):
),
],
)
def test_apply_series_on_date_time_index_aware_series(self, dti, exp):
@pytest.mark.parametrize("aware", [True, False])
def test_apply_series_on_date_time_index_aware_series(self, dti, exp, aware):
# GH 25959
# Calling apply on a localized time series should not cause an error
index = dti.tz_localize("UTC").index
if aware:
index = dti.tz_localize("UTC").index
else:
index = dti.index
result = Series(index).apply(lambda x: Series([1, 2]))
tm.assert_frame_equal(result, exp)
