Skip to content

Commit 31e0743

Browse files
authored
REF: implement nested_data_to_arrays (#38757)
1 parent 8ff5c42 commit 31e0743

File tree

4 files changed: +69 -32 lines changed

pandas/core/arrays/categorical.py

+17-9
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,17 @@
22
from functools import partial
33
import operator
44
from shutil import get_terminal_size
5-
from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast
5+
from typing import (
6+
TYPE_CHECKING,
7+
Dict,
8+
Hashable,
9+
List,
10+
Sequence,
11+
Type,
12+
TypeVar,
13+
Union,
14+
cast,
15+
)
616
from warnings import warn
717

818
import numpy as np
@@ -58,6 +68,10 @@
5868

5969
from pandas.io.formats import console
6070

71+
if TYPE_CHECKING:
72+
from pandas import Index
73+
74+
6175
CategoricalT = TypeVar("CategoricalT", bound="Categorical")
6276

6377

@@ -1708,13 +1722,7 @@ def fillna(self, value=None, method=None, limit=None):
17081722
mask = self.isna()
17091723

17101724
new_codes = self._validate_setitem_value(value)
1711-
1712-
if isinstance(value, (np.ndarray, Categorical)):
1713-
# We get ndarray or Categorical if called via Series.fillna,
1714-
# where it will unwrap another aligned Series before getting here
1715-
codes[mask] = new_codes[mask]
1716-
else:
1717-
codes[mask] = new_codes
1725+
np.putmask(codes, mask, new_codes)
17181726

17191727
return self._from_backing_data(codes)
17201728

@@ -2510,7 +2518,7 @@ def _delegate_method(self, name, *args, **kwargs):
25102518
# utility routines
25112519

25122520

2513-
def _get_codes_for_values(values, categories) -> np.ndarray:
2521+
def _get_codes_for_values(values, categories: "Index") -> np.ndarray:
25142522
"""
25152523
utility routine to turn values into codes given the specified categories
25162524

pandas/core/frame.py

+8-19
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,6 @@
112112
is_integer_dtype,
113113
is_iterator,
114114
is_list_like,
115-
is_named_tuple,
116115
is_object_dtype,
117116
is_scalar,
118117
is_sequence,
@@ -129,7 +128,7 @@
129128
transform,
130129
)
131130
from pandas.core.arraylike import OpsMixin
132-
from pandas.core.arrays import Categorical, ExtensionArray
131+
from pandas.core.arrays import ExtensionArray
133132
from pandas.core.arrays.sparse import SparseFrameAccessor
134133
from pandas.core.construction import extract_array, sanitize_masked_array
135134
from pandas.core.generic import NDFrame, _shared_docs
@@ -147,13 +146,14 @@
147146
from pandas.core.internals.construction import (
148147
arrays_to_mgr,
149148
dataclasses_to_dicts,
150-
get_names_from_index,
151149
init_dict,
152150
init_ndarray,
153151
masked_rec_array_to_mgr,
152+
nested_data_to_arrays,
154153
reorder_arrays,
155154
sanitize_index,
156155
to_arrays,
156+
treat_as_nested,
157157
)
158158
from pandas.core.reshape.melt import melt
159159
from pandas.core.series import Series
@@ -565,27 +565,16 @@ def __init__(
565565
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
566566

567567
# For data is list-like, or Iterable (will consume into list)
568-
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
568+
elif is_list_like(data):
569569
if not isinstance(data, (abc.Sequence, ExtensionArray)):
570570
data = list(data)
571571
if len(data) > 0:
572572
if is_dataclass(data[0]):
573573
data = dataclasses_to_dicts(data)
574-
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
575-
if is_named_tuple(data[0]) and columns is None:
576-
columns = data[0]._fields
577-
arrays, columns = to_arrays(data, columns, dtype=dtype)
578-
columns = ensure_index(columns)
579-
580-
# set the index
581-
if index is None:
582-
if isinstance(data[0], Series):
583-
index = get_names_from_index(data)
584-
elif isinstance(data[0], Categorical):
585-
index = ibase.default_index(len(data[0]))
586-
else:
587-
index = ibase.default_index(len(data))
588-
574+
if treat_as_nested(data):
575+
arrays, columns, index = nested_data_to_arrays(
576+
data, columns, index, dtype
577+
)
589578
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
590579
else:
591580
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

pandas/core/internals/construction.py

+38-2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
is_extension_array_dtype,
2828
is_integer_dtype,
2929
is_list_like,
30+
is_named_tuple,
3031
is_object_dtype,
3132
)
3233
from pandas.core.dtypes.generic import (
@@ -106,7 +107,7 @@ def masked_rec_array_to_mgr(
106107
# essentially process a record array then fill it
107108
fdata = ma.getdata(data)
108109
if index is None:
109-
index = get_names_from_index(fdata)
110+
index = _get_names_from_index(fdata)
110111
if index is None:
111112
index = ibase.default_index(len(data))
112113
index = ensure_index(index)
@@ -286,6 +287,41 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
286287
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
287288

288289

290+
def nested_data_to_arrays(
    data: Sequence,
    columns: Optional[Index],
    index: Optional[Index],
    dtype: Optional[DtypeObj],
):
    """
    Convert a single sequence of arrays to multiple arrays.

    Parameters
    ----------
    data : Sequence
        Sequence whose first element is inspected to decide how columns and
        index are derived. Callers are expected to have already checked
        ``treat_as_nested(data)``, so ``data[0]`` must exist.
    columns : Index or None
        Column labels. When None and the first element is a namedtuple, the
        namedtuple's ``_fields`` are used instead.
    index : Index or None
        Row labels. When None, an index is derived from ``data`` (see the
        branch comments below).
    dtype : dtype or None
        Passed through unchanged to ``to_arrays``.

    Returns
    -------
    tuple
        ``(arrays, columns, index)`` where ``arrays`` is what ``to_arrays``
        returned, ``columns`` has been passed through ``ensure_index``, and
        ``index`` is either the caller's value or the derived one.
    """
    # By the time we get here we have already checked treat_as_nested(data)
    first = data[0]

    if is_named_tuple(first) and columns is None:
        columns = first._fields

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(first, ABCSeries):
            # Series elements: build the index from their names.
            index = _get_names_from_index(data)
        elif isinstance(first, Categorical):
            # Sized by the length of the first Categorical, not by len(data).
            index = ibase.default_index(len(first))
        else:
            index = ibase.default_index(len(data))

    return arrays, columns, index
316+
317+
318+
def treat_as_nested(data) -> bool:
    """
    Check if we should use nested_data_to_arrays.

    Returns True only when ``data`` is non-empty and its first element is
    list-like with ``ndim`` equal to 1. Elements that have no ``ndim``
    attribute are treated as 1-dimensional.
    """
    if len(data) == 0:
        return False

    first = data[0]
    return is_list_like(first) and getattr(first, "ndim", 1) == 1
323+
324+
289325
# ---------------------------------------------------------------------
290326

291327

@@ -432,7 +468,7 @@ def reorder_arrays(arrays, arr_columns, columns):
432468
return arrays, arr_columns
433469

434470

435-
def get_names_from_index(data):
471+
def _get_names_from_index(data):
436472
has_some_name = any(getattr(s, "name", None) is not None for s in data)
437473
if not has_some_name:
438474
return ibase.default_index(len(data))

pandas/tests/series/apply/test_series_apply.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -778,10 +778,14 @@ def test_map_missing_mixed(self, vals, mapping, exp):
778778
),
779779
],
780780
)
781-
def test_apply_series_on_date_time_index_aware_series(self, dti, exp):
781+
@pytest.mark.parametrize("aware", [True, False])
782+
def test_apply_series_on_date_time_index_aware_series(self, dti, exp, aware):
782783
# GH 25959
783784
# Calling apply on a localized time series should not cause an error
784-
index = dti.tz_localize("UTC").index
785+
if aware:
786+
index = dti.tz_localize("UTC").index
787+
else:
788+
index = dti.index
785789
result = Series(index).apply(lambda x: Series([1, 2]))
786790
tm.assert_frame_equal(result, exp)
787791

0 commit comments

Comments
 (0)