From 2d6b373d7886fd13f8daa57594c253aee10d1b7e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 17 Feb 2024 15:46:24 -0500 Subject: [PATCH 1/2] REGR: DataFrame.transpose resulting in not contiguous data on nullable EAs --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/masked.py | 4 +++- pandas/tests/frame/methods/test_transpose.py | 17 +++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 9733aff0e6eb5..f4a67d5a06742 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -30,6 +30,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) +- Fixed regression in :meth:`DataFrame.transpose` with nullable extension dtypes not having F-contiguous data potentially causing exceptions when used (:issue:`57315`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c1ed3dacb9a95..f9387a89543a4 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1661,7 +1661,9 @@ def transpose_homogeneous_masked_arrays( arr_type = dtype.construct_array_type() transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): - transposed_arr = arr_type(transposed_values[:, i], mask=transposed_masks[:, i]) + transposed_arr = arr_type( + transposed_values[:, i].copy(), mask=transposed_masks[:, i].copy() + ) transposed_arrays.append(transposed_arr) return transposed_arrays diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 495663ce135f9..b6be8deee6efd 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -179,3 +180,19 @@ def test_transpose_not_inferring_dt_mixed_blocks(self): dtype=object, ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype1", ["Int64", "Float64"]) + @pytest.mark.parametrize("dtype2", ["Int64", "Float64"]) + def test_transpose(self, dtype1, dtype2): + # GH#57315 - transpose should have C/F contiguous blocks + df = DataFrame( + { + "a": pd.array([1, 1, 2], dtype=dtype1), + "b": pd.array([3, 4, 5], dtype=dtype2), + } + ) + result = df.T + for blk in result._mgr.blocks: + # When dtypes are unequal, we get NumPy object array + data = blk.values._data if dtype1 == dtype2 else blk.values + assert data.flags["F_CONTIGUOUS"] From 7cf832c304bf1325996d577bb8138918dec3b942 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 18 Feb 2024 08:31:01 -0500 Subject: [PATCH 2/2] must go faster --- pandas/core/arrays/masked.py | 21 ++++++++++++++------ pandas/tests/frame/methods/test_transpose.py | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f9387a89543a4..c336706da45d6 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1651,19 +1651,28 @@ def transpose_homogeneous_masked_arrays( same dtype. The caller is responsible for ensuring validity of input data. """ masked_arrays = list(masked_arrays) + dtype = masked_arrays[0].dtype + values = [arr._data.reshape(1, -1) for arr in masked_arrays] - transposed_values = np.concatenate(values, axis=0) + transposed_values = np.concatenate( + values, + axis=0, + out=np.empty( + (len(masked_arrays), len(masked_arrays[0])), + order="F", + dtype=dtype.numpy_dtype, + ), + ) masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] - transposed_masks = np.concatenate(masks, axis=0) + transposed_masks = np.concatenate( + masks, axis=0, out=np.empty_like(transposed_values, dtype=bool) + ) - dtype = masked_arrays[0].dtype arr_type = dtype.construct_array_type() transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): - transposed_arr = arr_type( - transposed_values[:, i].copy(), mask=transposed_masks[:, i].copy() - ) + transposed_arr = arr_type(transposed_values[:, i], mask=transposed_masks[:, i]) transposed_arrays.append(transposed_arr) return transposed_arrays diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index b6be8deee6efd..f42fd4483e9ac 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -184,7 +184,7 @@ def test_transpose_not_inferring_dt_mixed_blocks(self): @pytest.mark.parametrize("dtype1", ["Int64", "Float64"]) @pytest.mark.parametrize("dtype2", ["Int64", "Float64"]) def test_transpose(self, dtype1, dtype2): - # GH#57315 - transpose should have C/F contiguous blocks + # GH#57315 - transpose should have F contiguous blocks df = DataFrame( { "a": pd.array([1, 1, 2], dtype=dtype1),