Skip to content

Commit f0c4093

Browse files
[ArrayManager] REF: Implement concat with reindexing (#39612)
1 parent e97c766 commit f0c4093

File tree

12 files changed

+184
-41
lines changed

12 files changed

+184
-41
lines changed

pandas/core/dtypes/concat.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,13 @@
3030
)
3131

3232

33-
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
33+
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
3434
"""
3535
Helper function for `arr.astype(common_dtype)` but handling all special
3636
cases.
3737
"""
38+
if is_dtype_equal(arr.dtype, dtype):
39+
return arr
3840
if (
3941
is_categorical_dtype(arr.dtype)
4042
and isinstance(dtype, np.dtype)
@@ -121,7 +123,7 @@ def is_nonempty(x) -> bool:
121123
# for axis=0
122124
if not single_dtype:
123125
target_dtype = find_common_type([x.dtype for x in to_concat])
124-
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
126+
to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat]
125127

126128
if isinstance(to_concat[0], ExtensionArray):
127129
cls = type(to_concat[0])

pandas/core/internals/array_manager.py

+68-3
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818
)
1919
from pandas._typing import (
2020
ArrayLike,
21+
DtypeObj,
2122
Hashable,
2223
)
2324
from pandas.util._validators import validate_bool_kwarg
2425

2526
from pandas.core.dtypes.cast import (
2627
astype_array_safe,
28+
ensure_dtype_can_hold_na,
2729
infer_dtype_from_scalar,
2830
soft_convert_objects,
2931
)
@@ -49,6 +51,7 @@
4951
from pandas.core.dtypes.missing import (
5052
array_equals,
5153
isna,
54+
na_value_for_dtype,
5255
)
5356

5457
import pandas.core.algorithms as algos
@@ -952,10 +955,18 @@ def reindex_indexer(
952955
# ignored keywords
953956
consolidate: bool = True,
954957
only_slice: bool = False,
958+
# ArrayManager specific keywords
959+
use_na_proxy: bool = False,
955960
) -> T:
956961
axis = self._normalize_axis(axis)
957962
return self._reindex_indexer(
958-
new_axis, indexer, axis, fill_value, allow_dups, copy
963+
new_axis,
964+
indexer,
965+
axis,
966+
fill_value,
967+
allow_dups,
968+
copy,
969+
use_na_proxy,
959970
)
960971

961972
def _reindex_indexer(
@@ -966,6 +977,7 @@ def _reindex_indexer(
966977
fill_value=None,
967978
allow_dups: bool = False,
968979
copy: bool = True,
980+
use_na_proxy: bool = False,
969981
) -> T:
970982
"""
971983
Parameters
@@ -1000,7 +1012,9 @@ def _reindex_indexer(
10001012
new_arrays = []
10011013
for i in indexer:
10021014
if i == -1:
1003-
arr = self._make_na_array(fill_value=fill_value)
1015+
arr = self._make_na_array(
1016+
fill_value=fill_value, use_na_proxy=use_na_proxy
1017+
)
10041018
else:
10051019
arr = self.arrays[i]
10061020
new_arrays.append(arr)
@@ -1051,7 +1065,11 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
10511065
new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
10521066
)
10531067

1054-
def _make_na_array(self, fill_value=None):
1068+
def _make_na_array(self, fill_value=None, use_na_proxy=False):
1069+
if use_na_proxy:
1070+
assert fill_value is None
1071+
return NullArrayProxy(self.shape_proper[0])
1072+
10551073
if fill_value is None:
10561074
fill_value = np.nan
10571075

@@ -1271,3 +1289,50 @@ def set_values(self, values: ArrayLike):
12711289
valid for the current SingleArrayManager (length, dtype, etc).
12721290
"""
12731291
self.arrays[0] = values
1292+
1293+
1294+
class NullArrayProxy:
    """
    Proxy object for an all-NA array.

    Only stores the length of the array, and not the dtype. The dtype
    will only be known when actually concatenating (after determining the
    common dtype, for which this proxy is ignored).
    Using this object avoids that the internals/concat.py needs to determine
    the proper dtype and array type.

    Parameters
    ----------
    n : int
        Length of the all-NA array this proxy stands in for.
    """

    # the proxy always represents a 1D array
    ndim = 1

    def __init__(self, n: int):
        self.n = n

    @property
    def shape(self):
        """Shape of the represented array: the 1-tuple ``(n,)``."""
        return (self.n,)

    def to_array(self, dtype: DtypeObj) -> ArrayLike:
        """
        Create the actual all-NA array represented by this proxy.

        Parameters
        ----------
        dtype : the dtype for the resulting array

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        if isinstance(dtype, ExtensionDtype):
            # taking only -1 positions (with allow_fill) from an empty array
            # gives an array of length n filled with the dtype's NA value
            empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
            indexer = np.full(self.n, -1, dtype=np.intp)
            return empty.take(indexer, allow_fill=True)
        else:
            # when introducing missing values, int becomes float, bool becomes object
            dtype = ensure_dtype_can_hold_na(dtype)
            fill_value = na_value_for_dtype(dtype)
            arr = np.empty(self.n, dtype=dtype)
            arr.fill(fill_value)
            return ensure_wrapped_if_datetimelike(arr)

pandas/core/internals/concat.py

+74-4
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828
is_extension_array_dtype,
2929
is_sparse,
3030
)
31-
from pandas.core.dtypes.concat import concat_compat
31+
from pandas.core.dtypes.concat import (
32+
cast_to_common_type,
33+
concat_compat,
34+
)
3235
from pandas.core.dtypes.dtypes import ExtensionDtype
3336
from pandas.core.dtypes.missing import (
3437
is_valid_na_for_dtype,
@@ -42,7 +45,10 @@
4245
ExtensionArray,
4346
)
4447
from pandas.core.construction import ensure_wrapped_if_datetimelike
45-
from pandas.core.internals.array_manager import ArrayManager
48+
from pandas.core.internals.array_manager import (
49+
ArrayManager,
50+
NullArrayProxy,
51+
)
4652
from pandas.core.internals.blocks import (
4753
ensure_block_shape,
4854
new_block,
@@ -74,14 +80,16 @@ def _concatenate_array_managers(
7480
mgrs = []
7581
for mgr, indexers in mgrs_indexers:
7682
for ax, indexer in indexers.items():
77-
mgr = mgr.reindex_indexer(axes[ax], indexer, axis=ax, allow_dups=True)
83+
mgr = mgr.reindex_indexer(
84+
axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True
85+
)
7886
mgrs.append(mgr)
7987

8088
if concat_axis == 1:
8189
# concatting along the rows -> concat the reindexed arrays
8290
# TODO(ArrayManager) doesn't yet preserve the correct dtype
8391
arrays = [
84-
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))])
92+
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
8593
for j in range(len(mgrs[0].arrays))
8694
]
8795
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
@@ -92,6 +100,68 @@ def _concatenate_array_managers(
92100
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
93101

94102

103+
def concat_arrays(to_concat: list) -> ArrayLike:
    """
    Concatenate 1D arrays for the ArrayManager, materializing all-NA proxies.

    This is a specialized counterpart of concat_compat: it only handles 1D
    arrays (there is no axis keyword), assumes the inputs already went through
    ensure_wrapped_if_datetimelike, and does not skip empty arrays when
    determining the result dtype. Every NullArrayProxy in the input is
    replaced with an actual all-NA array of the resulting dtype.

    Parameters
    ----------
    to_concat : list of arrays

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # the all-NA proxies carry no dtype, so they take no part in dtype resolution
    real_arrays = [arr for arr in to_concat if not isinstance(arr, NullArrayProxy)]

    if len({arr.dtype for arr in real_arrays}) == 1:
        target_dtype = real_arrays[0].dtype
    else:
        target_dtype = find_common_type([arr.dtype for arr in real_arrays])

    is_datetimelike = target_dtype.kind in ["m", "M"]

    materialized = []
    for arr in to_concat:
        if isinstance(arr, NullArrayProxy):
            arr = arr.to_array(target_dtype)
        elif not is_datetimelike:
            # datetimelike arrays are deliberately left uncast:
            # arr.astype(target_dtype, copy=False) doesn't work for
            # DatetimeArray/TimedeltaArray (returns ndarray)
            arr = cast_to_common_type(arr, target_dtype)
        materialized.append(arr)

    if is_datetimelike:
        # for datetimelike use DatetimeArray/TimedeltaArray concatenation
        array_cls = type(real_arrays[0])
        return array_cls._concat_same_type(materialized, axis=0)

    first = materialized[0]
    if isinstance(first, ExtensionArray):
        return type(first)._concat_same_type(materialized)

    result = np.concatenate(materialized)
    if len(result) != 0:
        return result

    # TODO decide on exact behaviour (we shouldn't do this only for empty result)
    # see https://github.com/pandas-dev/pandas/issues/39817
    # all empties -> check for bool to not coerce to float
    kinds = {arr.dtype.kind for arr in real_arrays}
    if len(kinds) != 1 and "b" in kinds:
        result = result.astype(object)
    return result
163+
164+
95165
def concatenate_managers(
96166
mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool
97167
) -> Manager:

pandas/tests/extension/base/reshaping.py

-3
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
import pandas.util._test_decorators as td
7-
86
import pandas as pd
97
from pandas.api.extensions import ExtensionArray
108
from pandas.core.internals import ExtensionBlock
@@ -111,7 +109,6 @@ def test_concat_extension_arrays_copy_false(self, data, na_value):
111109
result = pd.concat([df1, df2], axis=1, copy=False)
112110
self.assert_frame_equal(result, expected)
113111

114-
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) concat reindex
115112
def test_concat_with_reindex(self, data):
116113
# GH-33027
117114
a = pd.DataFrame({"a": data[:5]})

pandas/tests/frame/methods/test_append.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
import pandas.util._test_decorators as td
5-
64
import pandas as pd
75
from pandas import (
86
DataFrame,
@@ -13,9 +11,6 @@
1311
)
1412
import pandas._testing as tm
1513

16-
# TODO td.skip_array_manager_not_yet_implemented
17-
# appending with reindexing not yet working
18-
1914

2015
class TestDataFrameAppend:
2116
def test_append_multiindex(self, multiindex_dataframe_random_data, frame_or_series):
@@ -43,7 +38,6 @@ def test_append_empty_list(self):
4338
tm.assert_frame_equal(result, expected)
4439
assert result is not df # .append() should return a new object
4540

46-
@td.skip_array_manager_not_yet_implemented
4741
def test_append_series_dict(self):
4842
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])
4943

@@ -84,7 +78,6 @@ def test_append_series_dict(self):
8478
expected = df.append(df[-1:], ignore_index=True)
8579
tm.assert_frame_equal(result, expected)
8680

87-
@td.skip_array_manager_not_yet_implemented
8881
def test_append_list_of_series_dicts(self):
8982
df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"])
9083

@@ -103,7 +96,6 @@ def test_append_list_of_series_dicts(self):
10396
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
10497
tm.assert_frame_equal(result, expected)
10598

106-
@td.skip_array_manager_not_yet_implemented
10799
def test_append_missing_cols(self):
108100
# GH22252
109101
# exercise the conditional branch in append method where the data
@@ -148,8 +140,7 @@ def test_append_empty_dataframe(self):
148140
expected = df1.copy()
149141
tm.assert_frame_equal(result, expected)
150142

151-
@td.skip_array_manager_not_yet_implemented
152-
def test_append_dtypes(self):
143+
def test_append_dtypes(self, using_array_manager):
153144

154145
# GH 5754
155146
# row appends of different dtypes (so need to do by-item)
@@ -173,6 +164,10 @@ def test_append_dtypes(self):
173164
expected = DataFrame(
174165
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
175166
)
167+
if using_array_manager:
168+
# TODO(ArrayManager) decide on exact casting rules in concat
169+
# With ArrayManager, all-NaN float is not ignored
170+
expected = expected.astype(object)
176171
tm.assert_frame_equal(result, expected)
177172

178173
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
@@ -181,6 +176,9 @@ def test_append_dtypes(self):
181176
expected = DataFrame(
182177
{"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")}
183178
)
179+
if using_array_manager:
180+
# With ArrayManager, all-NaN float is not ignored
181+
expected = expected.astype(object)
184182
tm.assert_frame_equal(result, expected)
185183

186184
df1 = DataFrame({"bar": np.nan}, index=range(1))
@@ -189,6 +187,9 @@ def test_append_dtypes(self):
189187
expected = DataFrame(
190188
{"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")}
191189
)
190+
if using_array_manager:
191+
# With ArrayManager, all-NaN float is not ignored
192+
expected = expected.astype(object)
192193
tm.assert_frame_equal(result, expected)
193194

194195
df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1))
@@ -208,7 +209,6 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp):
208209
expected = Series(Timestamp(timestamp, tz=tz), name=0)
209210
tm.assert_series_equal(result, expected)
210211

211-
@td.skip_array_manager_not_yet_implemented
212212
@pytest.mark.parametrize(
213213
"data, dtype",
214214
[
-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +0,0 @@
1-
import pandas.util._test_decorators as td
2-
3-
# TODO(ArrayManager) concat axis=0
4-
pytestmark = td.skip_array_manager_not_yet_implemented

pandas/tests/reshape/concat/test_append.py

+6
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import numpy as np
77
import pytest
88

9+
import pandas.util._test_decorators as td
10+
911
import pandas as pd
1012
from pandas import (
1113
DataFrame,
@@ -338,6 +340,10 @@ def test_append_missing_column_proper_upcast(self, sort):
338340
assert appended["A"].dtype == "f8"
339341
assert appended["B"].dtype == "O"
340342

343+
# TODO(ArrayManager) DataFrame.append reindexes a Series itself (giving
344+
# float dtype) -> delay reindexing until concat_array_managers which properly
345+
# takes care of all-null dtype inference
346+
@td.skip_array_manager_not_yet_implemented
341347
def test_append_empty_frame_to_series_with_dateutil_tz(self):
342348
# GH 23682
343349
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())

0 commit comments

Comments
 (0)