Skip to content

Commit e89624f

Browse files
REGR: preserve array object for concat with all-NA array
1 parent fc68a9a commit e89624f

File tree

5 files changed

+134
-1
lines changed

5 files changed

+134
-1
lines changed

doc/source/whatsnew/v1.4.4.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`)
18+
- Fixed regression in :func:`concat` and :func:`merge` where an all-NaN ExtensionArray was replaced by a newly created array, losing custom attributes stored on the original array
1819
-
1920

2021
.. ---------------------------------------------------------------------------

pandas/core/internals/concat.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,11 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
471471
if len(values) and values[0] is None:
472472
fill_value = None
473473

474-
if is_datetime64tz_dtype(empty_dtype):
474+
if blk_dtype == empty_dtype and self.indexers:
475+
# avoid creating new empty array if we already have an array
476+
# with correct dtype that can be reindexed
477+
pass
478+
elif is_datetime64tz_dtype(empty_dtype):
475479
i8values = np.full(self.shape, fill_value.value)
476480
return DatetimeArray(i8values, dtype=empty_dtype)
477481

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from pandas.tests.extension.array_with_attr.array import (
2+
FloatAttrArray,
3+
FloatAttrDtype,
4+
make_data,
5+
)
6+
7+
__all__ = ["FloatAttrArray", "FloatAttrDtype", "make_data"]
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
"""
2+
Test extension array that has custom attribute information (not stored on the dtype).
3+
4+
"""
5+
from __future__ import annotations
6+
7+
import numbers
8+
9+
import numpy as np
10+
11+
from pandas._typing import type_t
12+
13+
from pandas.core.dtypes.base import ExtensionDtype
14+
15+
import pandas as pd
16+
from pandas.core.arrays import ExtensionArray
17+
18+
19+
class FloatAttrDtype(ExtensionDtype):
    """
    Dtype of :class:`FloatAttrArray`, a float64-backed test extension array.

    The dtype itself holds no state; the custom ``attr`` information is kept
    on the array instances, not here.
    """

    # FloatAttrArray only accepts float64 ndarrays and hands out elements of
    # that ndarray as scalars, so the scalar type is float (not int) and the
    # dtype name should say so as well.
    type = float
    name = "float_attr"
    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[FloatAttrArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatAttrArray
34+
35+
36+
class FloatAttrArray(ExtensionArray):
    """
    Minimal float64-backed extension array carrying an extra ``attr``.

    ``attr`` is stored on the array instance (not on the dtype). Every method
    here that derives a new array from an existing one -- ``__getitem__`` with
    a non-scalar key, ``take``, ``copy`` and ``_concat_same_type`` -- passes
    the attribute on to the result.
    """

    dtype = FloatAttrDtype()
    __array_priority__ = 1000

    def __init__(self, values, attr=None) -> None:
        """
        Parameters
        ----------
        values : np.ndarray
            Backing data; must be a numpy array of float64 dtype.
        attr : object, optional
            Arbitrary extra information attached to this array.

        Raises
        ------
        TypeError
            If ``values`` is not a float64 numpy array.
        """
        if not isinstance(values, np.ndarray) or values.dtype != "float64":
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        self.data = values
        self.attr = attr

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Build an array (without ``attr``) from a sequence of scalars.

        ``dtype`` is accepted for interface compatibility and is not used;
        the data is always converted to float64.
        """
        if copy:
            data = np.array(scalars, dtype="float64", copy=True)
        else:
            # np.array(..., copy=False) raises under NumPy >= 2 whenever a
            # copy cannot be avoided (e.g. for a list); np.asarray copies
            # only if needed on every NumPy version.
            data = np.asarray(scalars, dtype="float64")
        return cls(data)

    def __getitem__(self, item):
        """Return a scalar for an integer key, otherwise a new array."""
        if isinstance(item, numbers.Integral):
            return self.data[item]
        else:
            # slice, list-like, mask
            item = pd.api.indexers.check_array_indexer(self, item)
            return type(self)(self.data[item], self.attr)

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        """Return a boolean ndarray marking the NaN entries."""
        return np.isnan(self.data)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements by position; the result keeps this array's ``attr``."""
        from pandas.api.extensions import take

        data = self.data
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
        return type(self)(result, self.attr)

    def copy(self):
        """Return a copy with copied data and the same ``attr``."""
        return type(self)(self.data.copy(), self.attr)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate arrays of this type.

        The result takes the ``attr`` of the first array. An empty
        ``to_concat`` yields an empty array without ``attr``.
        """
        if not len(to_concat):
            # np.concatenate raises on an empty sequence of arrays
            return cls(np.array([], dtype="float64"), None)
        data = np.concatenate([x.data for x in to_concat])
        return cls(data, to_concat[0].attr)
85+
86+
87+
def make_data():
    """Return the float64 values ``0.0, 1.0, ..., 99.0`` as a 1-D ndarray."""
    n_values = 100
    return np.arange(n_values, dtype=np.float64)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import numpy as np
2+
import pytest
3+
4+
import pandas as pd
5+
import pandas._testing as tm
6+
from pandas.tests.extension.array_with_attr import (
7+
FloatAttrArray,
8+
FloatAttrDtype,
9+
make_data,
10+
)
11+
12+
13+
@pytest.fixture
def dtype():
    """Provide a fresh ``FloatAttrDtype`` instance."""
    attr_dtype = FloatAttrDtype()
    return attr_dtype
16+
17+
18+
@pytest.fixture
def data():
    """Provide a ``FloatAttrArray`` wrapping ``make_data()``, with no ``attr`` given."""
    values = make_data()
    return FloatAttrArray(values)
21+
22+
23+
def test_concat_with_all_na(data):
    """An all-NaN extension column keeps its array ``attr`` through a merge."""
    # https://github.com/pandas-dev/pandas/issues/28840
    all_na = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")
    left = pd.DataFrame({"col": all_na, "key": [0, 1]})
    right = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})

    merged = left.merge(right, on="key")

    want = pd.DataFrame({"col": all_na, "key": [0, 1], "col2": [1, 2]})
    tm.assert_frame_equal(merged, want)
    # the attribute lives on the array object, so check it separately
    assert merged["col"].array.attr == "test"

0 commit comments

Comments
 (0)