Commit 84ea331

jorisvandenbossche authored and meeseeksmachine committed

Backport PR pandas-dev#47762: REGR: preserve reindexed array object (instead of creating new array) for concat with all-NA array

1 parent 3ca5773 · commit 84ea331

File tree

5 files changed: +139 -10 lines changed

doc/source/whatsnew/v1.4.4.rst (+1)

@@ -17,6 +17,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.fillna` not working :class:`DataFrame` with :class:`MultiIndex` (:issue:`47649`)
 - Fixed regression in taking NULL :class:`objects` from a :class:`DataFrame` causing a segmentation violation. These NULL values are created by :meth:`numpy.empty_like` (:issue:`46848`)
 - Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`)
+- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes (:issue:`47762`)
 - Fixed regression in calling bitwise numpy ufuncs (for example, ``np.bitwise_and``) on Index objects (:issue:`46769`)
 - Fixed regression in :func:`cut` using a ``datetime64`` IntervalIndex as bins (:issue:`46218`)
 - Fixed regression in :meth:`DataFrame.select_dtypes` where ``include="number"`` included :class:`BooleanDtype` (:issue:`46870`)

pandas/core/internals/concat.py (+15 -10)

@@ -476,16 +476,21 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
                     return DatetimeArray(i8values, dtype=empty_dtype)
 
                 elif is_1d_only_ea_dtype(empty_dtype):
-                    empty_dtype = cast(ExtensionDtype, empty_dtype)
-                    cls = empty_dtype.construct_array_type()
-
-                    missing_arr = cls._from_sequence([], dtype=empty_dtype)
-                    ncols, nrows = self.shape
-                    assert ncols == 1, ncols
-                    empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
-                    return missing_arr.take(
-                        empty_arr, allow_fill=True, fill_value=fill_value
-                    )
+                    if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers:
+                        # avoid creating new empty array if we already have an array
+                        # with correct dtype that can be reindexed
+                        pass
+                    else:
+                        empty_dtype = cast(ExtensionDtype, empty_dtype)
+                        cls = empty_dtype.construct_array_type()
+
+                        missing_arr = cls._from_sequence([], dtype=empty_dtype)
+                        ncols, nrows = self.shape
+                        assert ncols == 1, ncols
+                        empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
+                        return missing_arr.take(
+                            empty_arr, allow_fill=True, fill_value=fill_value
+                        )
                 elif isinstance(empty_dtype, ExtensionDtype):
                     # TODO: no tests get here, a handful would if we disabled
                     # the dt64tz special-case above (which is faster)
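
The change above makes concat reuse the block's existing all-NA ExtensionArray (reindexing it via take) when its dtype already matches the target dtype, instead of constructing a fresh empty array from the dtype, so array-level state that is not encoded in the dtype survives the reindexing. A minimal sketch of the difference, assuming the FloatAttrArray test helper added below is importable (it lives under pandas.tests, so this is illustrative rather than a public API example):

import numpy as np

from pandas.tests.extension.array_with_attr import FloatAttrArray

# An all-NA extension array carrying state ("attr") that its dtype knows nothing about.
arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")

# New behaviour: the existing array is reindexed, and FloatAttrArray.take forwards "attr".
reindexed = arr.take([0, -1], allow_fill=True)
assert reindexed.attr == "test"

# Old behaviour (sketched): a fresh empty array is built from the dtype and filled via
# take; the fresh array cannot know about "attr", so the attribute is lost.
fresh = FloatAttrArray._from_sequence([], dtype=arr.dtype)
refilled = fresh.take(np.array([-1, -1], dtype=np.intp), allow_fill=True)
assert refilled.attr is None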
pandas/tests/extension/array_with_attr/__init__.py (new file, +6)

@@ -0,0 +1,6 @@
+from pandas.tests.extension.array_with_attr.array import (
+    FloatAttrArray,
+    FloatAttrDtype,
+)
+
+__all__ = ["FloatAttrArray", "FloatAttrDtype"]
pandas/tests/extension/array_with_attr/array.py (new file, +84)

@@ -0,0 +1,84 @@
+"""
+Test extension array that has custom attribute information (not stored on the dtype).
+
+"""
+from __future__ import annotations
+
+import numbers
+
+import numpy as np
+
+from pandas._typing import type_t
+
+from pandas.core.dtypes.base import ExtensionDtype
+
+import pandas as pd
+from pandas.core.arrays import ExtensionArray
+
+
+class FloatAttrDtype(ExtensionDtype):
+    type = float
+    name = "float_attr"
+    na_value = np.nan
+
+    @classmethod
+    def construct_array_type(cls) -> type_t[FloatAttrArray]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return FloatAttrArray
+
+
+class FloatAttrArray(ExtensionArray):
+    dtype = FloatAttrDtype()
+    __array_priority__ = 1000
+
+    def __init__(self, values, attr=None) -> None:
+        if not isinstance(values, np.ndarray):
+            raise TypeError("Need to pass a numpy array of float64 dtype as values")
+        if not values.dtype == "float64":
+            raise TypeError("Need to pass a numpy array of float64 dtype as values")
+        self.data = values
+        self.attr = attr
+
+    @classmethod
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        data = np.array(scalars, dtype="float64", copy=copy)
+        return cls(data)
+
+    def __getitem__(self, item):
+        if isinstance(item, numbers.Integral):
+            return self.data[item]
+        else:
+            # slice, list-like, mask
+            item = pd.api.indexers.check_array_indexer(self, item)
+            return type(self)(self.data[item], self.attr)
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    def isna(self):
+        return np.isnan(self.data)
+
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        from pandas.api.extensions import take
+
+        data = self.data
+        if allow_fill and fill_value is None:
+            fill_value = self.dtype.na_value
+
+        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
+        return type(self)(result, self.attr)
+
+    def copy(self):
+        return type(self)(self.data.copy(), self.attr)
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        data = np.concatenate([x.data for x in to_concat])
+        attr = to_concat[0].attr if len(to_concat) else None
+        return cls(data, attr)
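
For reference, a short usage sketch of this helper (names as defined above; the attribute value is made up for illustration): the custom attr travels with the array through slicing, take, copy and _concat_same_type, which is precisely the state the concat.py change preserves.

import numpy as np

from pandas.tests.extension.array_with_attr import FloatAttrArray

arr = FloatAttrArray(np.array([1.0, np.nan, 3.0], dtype="float64"), attr="unit=m")

# Slicing, take and copy all forward the attribute to the new array.
assert arr[1:].attr == "unit=m"
assert arr.take([0, 2]).attr == "unit=m"
assert arr.copy().attr == "unit=m"

# Concatenation keeps the attribute of the first array in the sequence.
combined = FloatAttrArray._concat_same_type([arr, arr.copy()])
assert combined.attr == "unit=m"
assert len(combined) == 6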
pandas/tests/extension/array_with_attr/test_array_with_attr.py (new file, +33)

@@ -0,0 +1,33 @@
+import numpy as np
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.tests.extension.array_with_attr import FloatAttrArray
+
+
+def test_concat_with_all_na():
+    # https://github.com/pandas-dev/pandas/pull/47762
+    # ensure that attribute of the column array is preserved (when it gets
+    # preserved in reindexing the array) during merge/concat
+    arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")
+
+    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
+    df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
+    result = pd.merge(df1, df2, on="key")
+    expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
+    tm.assert_frame_equal(result, expected)
+    assert result["col"].array.attr == "test"
+
+    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
+    df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
+    result = pd.merge(df1, df2, on="key")
+    expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
+    tm.assert_frame_equal(result, expected)
+    assert result["col"].array.attr == "test"
+
+    result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1)
+    expected = pd.DataFrame(
+        {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
+    ).set_index("key")
+    tm.assert_frame_equal(result, expected)
+    assert result["col"].array.attr == "test"
