REGR: preserve reindexed array object (instead of creating new array) for concat with all-NA array #47762
Changes from 8 commits
@@ -471,7 +471,11 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
                     if len(values) and values[0] is None:
                         fill_value = None

-                if is_datetime64tz_dtype(empty_dtype):
+                if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers:
+                    # avoid creating new empty array if we already have an array
+                    # with correct dtype that can be reindexed
+                    pass
+                elif is_datetime64tz_dtype(empty_dtype):
                     i8values = np.full(self.shape, fill_value.value)
                     return DatetimeArray(i8values, dtype=empty_dtype)

Review comment: this didn't need to change? (just a leftover from the code move?)
Reply: yes, good catch
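To illustrate the intent of the new branch (a toy sketch only, not the actual pandas code path; the TaggedArray class and all names below are invented for illustration): reindexing the array object that is already on the block lets instance-level state travel along, whereas constructing a brand-new all-NA array cannot know about it.

```python
# Toy illustration only (not pandas internals): reindexing an existing array
# object preserves instance-level state, while building a fresh all-NA array
# from scratch loses it.
import numpy as np


class TaggedArray:
    def __init__(self, values, tag=None):
        self.values = np.asarray(values, dtype="float64")
        self.tag = tag  # instance-level state, not encoded in any dtype

    def take(self, indexer):
        # "reindex the existing array" path: the tag travels along
        return TaggedArray(self.values.take(indexer), self.tag)


arr = TaggedArray([np.nan, np.nan], tag="unit-info")

reindexed = arr.take([0, 1, 1])           # reuse and reindex the existing object
fresh = TaggedArray(np.full(3, np.nan))   # construct a new all-NA array instead

print(reindexed.tag)  # unit-info
print(fresh.tag)      # None
```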
New file: __init__.py of the array_with_attr test fixture package

@@ -0,0 +1,7 @@
from pandas.tests.extension.array_with_attr.array import (
    FloatAttrArray,
    FloatAttrDtype,
    make_data,
)

__all__ = ["FloatAttrArray", "FloatAttrDtype", "make_data"]
New file: array.py defining the FloatAttrArray / FloatAttrDtype test fixture

@@ -0,0 +1,88 @@
""" | ||
Test extension array that has custom attribute information (not stored on the dtype). | ||
|
||
""" | ||
from __future__ import annotations | ||
|
||
import numbers | ||
|
||
import numpy as np | ||
|
||
from pandas._typing import type_t | ||
|
||
from pandas.core.dtypes.base import ExtensionDtype | ||
|
||
import pandas as pd | ||
from pandas.core.arrays import ExtensionArray | ||
|
||
|
||
class FloatAttrDtype(ExtensionDtype): | ||
type = int | ||
name = "int_attr" | ||
Review comment: these should be float?
Reply: It doesn't matter for the test, but yes, that's less confusing ;)
    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[FloatAttrArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatAttrArray


class FloatAttrArray(ExtensionArray):
    dtype = FloatAttrDtype()
    __array_priority__ = 1000

    def __init__(self, values, attr=None) -> None:
        if not isinstance(values, np.ndarray):
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        if not values.dtype == "float64":
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        self.data = values
        self.attr = attr

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        data = np.array(scalars, dtype="float64", copy=copy)
        return cls(data)

    def __getitem__(self, item):
        if isinstance(item, numbers.Integral):
            return self.data[item]
        else:
            # slice, list-like, mask
            item = pd.api.indexers.check_array_indexer(self, item)
            return type(self)(self.data[item], self.attr)

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        return np.isnan(self.data)

    def take(self, indexer, allow_fill=False, fill_value=None):
        from pandas.api.extensions import take

        data = self.data
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
        return type(self)(result, self.attr)

    def copy(self):
        return type(self)(self.data.copy(), self.attr)

    @classmethod
    def _concat_same_type(cls, to_concat):
        data = np.concatenate([x.data for x in to_concat])
        attr = to_concat[0].attr if len(to_concat) else None
        return cls(data, attr)


def make_data():
    return np.arange(100, dtype="float64")
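A quick usage sketch (not part of the diff) of how this fixture behaves; it assumes the PR's files are in place so that the array_with_attr package above is importable:

```python
# Not part of the diff: a small check of the fixture array, assuming the
# array_with_attr package from this PR is importable.
from pandas.tests.extension.array_with_attr import FloatAttrArray, make_data

arr = FloatAttrArray(make_data(), attr="test")

taken = arr.take([0, -1], allow_fill=True)         # fills with NaN, keeps attr
joined = FloatAttrArray._concat_same_type([arr, taken])

print(taken.attr, joined.attr)   # test test
print(taken.isna())              # [False  True]
```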
New file: tests for the array_with_attr fixture

@@ -0,0 +1,48 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.array_with_attr import (
    FloatAttrArray,
    FloatAttrDtype,
    make_data,
)


@pytest.fixture
def dtype():
    return FloatAttrDtype()


@pytest.fixture
def data():
    return FloatAttrArray(make_data())


def test_concat_with_all_na(data):

Review comment: we don't use the fixture

    # https://github.com/pandas-dev/pandas/pull/47762
    # ensure that attribute of the column array is preserved (when it gets
    # preserved in reindexing the array) during merge/concat
    arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")

    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
    df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
    result = pd.merge(df1, df2, on="key")
    expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
    df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
    result = pd.merge(df1, df2, on="key")
    expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"
    result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1)
    expected = pd.DataFrame(
        {"col": arr.take([0, 0, 1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
    ).set_index("key")
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

Review comment (on arr.take([0, 0, 1]) in the expected frame): this should be arr.take([0, 1, -1])?
Reply: Since it is all NaNs, it actually doesn't matter what we are taking. But changed it to make it less confusing
Review comment (on the new branch in get_reindexed_values): is there a reason this is caught here, instead of in the elif is_1d_only_ea_dtype block? In the OP you mention the benefits for EA types. What are the benefits of using this path for the builtin dtypes? (and for a targeted backport?)

Reply: I can move it into the is_1d_only_ea_dtype block. That would fix it for the specific use case of custom extension dtypes. Looking a bit more in detail, if we have a numpy dtype, the approach with np.empty might actually be faster than take, so that's a good reason to keep the check specific for extension dtypes.
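The timings the reply refers to were not captured in this page. As a rough, illustrative comparison of the two approaches for a plain numpy dtype (array size and repeat count below are arbitrary), one could measure something like:

```python
# Illustrative only, not the timings referenced above: compare allocating a
# fresh all-NA float64 array with np.full vs. reindexing an existing one with
# take, which is roughly the trade-off discussed for builtin (numpy) dtypes.
import timeit

import numpy as np

n = 1_000_000
existing = np.full(n, np.nan)
indexer = np.arange(n)

t_full = timeit.timeit(lambda: np.full(n, np.nan), number=100)
t_take = timeit.timeit(lambda: existing.take(indexer), number=100)

print(f"np.full: {t_full:.4f}s   take: {t_take:.4f}s")
```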