REGR: preserve reindexed array object (instead of creating new array) for concat with all-NA array #47762


Merged
Changes from 8 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.4.rst
@@ -17,6 +17,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.fillna` not working on a :class:`DataFrame` with a :class:`MultiIndex` (:issue:`47649`)
- Fixed regression in taking NULL :class:`objects` from a :class:`DataFrame` causing a segmentation violation. These NULL values are created by :meth:`numpy.empty_like` (:issue:`46848`)
- Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`)
- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes (:issue:`47762`)
- Fixed regression in calling bitwise numpy ufuncs (for example, ``np.bitwise_and``) on Index objects (:issue:`46769`)
- Fixed regression in :func:`cut` using a ``datetime64`` IntervalIndex as bins (:issue:`46218`)
- Fixed regression in :meth:`DataFrame.select_dtypes` where ``include="number"`` included :class:`BooleanDtype` (:issue:`46870`)
6 changes: 5 additions & 1 deletion pandas/core/internals/concat.py
@@ -471,7 +471,11 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
         if len(values) and values[0] is None:
             fill_value = None

-        if is_datetime64tz_dtype(empty_dtype):
+        if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers:
Member:
Is there a reason this is caught here, instead of in the `elif is_1d_only_ea_dtype` block? In the OP you mention the benefits for EA types. What are the benefits of using this path for the builtin dtypes (and for a targeted backport)?

Member Author:
I can move it into the `is_1d_only_ea_dtype` block. That would fix it for the specific use case of custom extension dtypes.

Looking a bit more in detail, if we have a numpy dtype, the approach with `np.empty` might actually be faster than `take`:

In [6]: arr = np.array([np.nan]*10000)

In [7]: indexer = np.arange(10000, dtype="int32")

In [9]: %timeit pd.core.algorithms.take_nd(arr, indexer, axis=0)
28.4 µs ± 752 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

In [11]: %%timeit
    ...: arr2 = np.empty(arr.shape, dtype=arr.dtype)
    ...: arr2.fill(np.nan)
5.68 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

So that's a good reason to keep the check specific for extension dtypes.

+            # avoid creating new empty array if we already have an array
+            # with correct dtype that can be reindexed
+            pass
+        elif is_datetime64tz_dtype(empty_dtype):
Member:
this didn't need to change? (just a leftover from the code move?)

Member Author:
yes, good catch

             i8values = np.full(self.shape, fill_value.value)
             return DatetimeArray(i8values, dtype=empty_dtype)

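The performance point from the review thread above can be restated as an equivalence check: for an all-NaN block with a numpy dtype, reindexing the existing array with `take` and allocating a fresh array with `np.empty` + `fill` produce the same values, so for builtin dtypes the choice is purely about speed, while for extension dtypes reusing the array preserves the object (and any custom attributes on it). A small sketch, using the public `pandas.api.extensions.take` instead of the internal `take_nd`:

```python
import numpy as np
from pandas.api.extensions import take

arr = np.full(10_000, np.nan)
indexer = np.arange(10_000, dtype="intp")

# path kept for extension dtypes: reindex the existing array,
# which preserves the array object and any metadata stored on it
reindexed = take(arr, indexer)

# path kept for numpy dtypes: allocate and fill a fresh all-NA array
# (faster for plain ndarrays, per the timings above)
fresh = np.empty(arr.shape, dtype=arr.dtype)
fresh.fill(np.nan)

assert reindexed.shape == fresh.shape
assert np.isnan(reindexed).all() and np.isnan(fresh).all()
```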
7 changes: 7 additions & 0 deletions pandas/tests/extension/array_with_attr/__init__.py
@@ -0,0 +1,7 @@
from pandas.tests.extension.array_with_attr.array import (
    FloatAttrArray,
    FloatAttrDtype,
    make_data,
)

__all__ = ["FloatAttrArray", "FloatAttrDtype", "make_data"]
88 changes: 88 additions & 0 deletions pandas/tests/extension/array_with_attr/array.py
@@ -0,0 +1,88 @@
"""
Test extension array that has custom attribute information (not stored on the dtype).

"""
from __future__ import annotations

import numbers

import numpy as np

from pandas._typing import type_t

from pandas.core.dtypes.base import ExtensionDtype

import pandas as pd
from pandas.core.arrays import ExtensionArray


class FloatAttrDtype(ExtensionDtype):
    type = int
    name = "int_attr"
Member:
these should be float?

Member Author:
It doesn't matter for the test, but yes that's less confusing ;)

    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[FloatAttrArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatAttrArray


class FloatAttrArray(ExtensionArray):
    dtype = FloatAttrDtype()
    __array_priority__ = 1000

    def __init__(self, values, attr=None) -> None:
        if not isinstance(values, np.ndarray):
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        if not values.dtype == "float64":
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        self.data = values
        self.attr = attr

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        data = np.array(scalars, dtype="float64", copy=copy)
        return cls(data)

    def __getitem__(self, item):
        if isinstance(item, numbers.Integral):
            return self.data[item]
        else:
            # slice, list-like, mask
            item = pd.api.indexers.check_array_indexer(self, item)
            return type(self)(self.data[item], self.attr)

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        return np.isnan(self.data)

    def take(self, indexer, allow_fill=False, fill_value=None):
        from pandas.api.extensions import take

        data = self.data
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
        return type(self)(result, self.attr)

    def copy(self):
        return type(self)(self.data.copy(), self.attr)

    @classmethod
    def _concat_same_type(cls, to_concat):
        data = np.concatenate([x.data for x in to_concat])
        attr = to_concat[0].attr if len(to_concat) else None
        return cls(data, attr)


def make_data():
    return np.arange(100, dtype="float64")
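Assuming the module above is importable (it ships under `pandas.tests.extension.array_with_attr` once this PR lands), the custom `attr` propagates through slicing, `take`, and `copy`, since each of those forwards `self.attr` to the new array — a quick sketch:

```python
import numpy as np
from pandas.tests.extension.array_with_attr import FloatAttrArray

arr = FloatAttrArray(np.array([1.0, 2.0, np.nan]), attr="test")

assert arr[1:].attr == "test"           # __getitem__ forwards attr
assert arr.take([0, 2]).attr == "test"  # take forwards attr
assert arr.copy().attr == "test"        # copy forwards attr
```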
48 changes: 48 additions & 0 deletions pandas/tests/extension/array_with_attr/test_array_with_attr.py
@@ -0,0 +1,48 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.array_with_attr import (
FloatAttrArray,
FloatAttrDtype,
make_data,
)


@pytest.fixture
def dtype():
    return FloatAttrDtype()


@pytest.fixture
def data():
    return FloatAttrArray(make_data())


def test_concat_with_all_na(data):
Member:
we don't use the fixture

    # https://github.com/pandas-dev/pandas/pull/47762
    # ensure that attribute of the column array is preserved (when it gets
    # preserved in reindexing the array) during merge/concat
    arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")

    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
    df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
    result = pd.merge(df1, df2, on="key")
    expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
    df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
    result = pd.merge(df1, df2, on="key")
    expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

    result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1)
    expected = pd.DataFrame(
        {"col": arr.take([0, 0, 1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
Member:
this should be `arr.take([0, 1, -1])`?

Member Author:
Since it is all NaNs, it actually doesn't matter what we are taking. But I changed it to make it less confusing.

    ).set_index("key")
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"
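The reviewer's `arr.take([0, 1, -1])` suggestion above reflects how pandas encodes an outer-join reindex: positions that exist keep their row index, and `-1` marks a missing row to be filled with NA. A minimal sketch using the public `pandas.api.extensions.take`:

```python
import numpy as np
from pandas.api.extensions import take

arr = np.array([1.0, 2.0])
# outer-join style indexer: rows 0 and 1 exist, the third row is
# missing and gets filled with the fill_value
result = take(arr, np.array([0, 1, -1]), allow_fill=True, fill_value=np.nan)

assert result[0] == 1.0 and result[1] == 2.0
assert np.isnan(result[2])
```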