From 0c9d549696f73417a844206872802a2d0322775c Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 9 Jul 2021 20:12:42 -0400 Subject: [PATCH 1/8] REGR: astype changing frame order --- pandas/core/dtypes/cast.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 15a18d5027274..e588c75ff4c6d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -14,7 +14,6 @@ from typing import ( TYPE_CHECKING, Any, - Literal, Sized, TypeVar, cast, @@ -1095,13 +1094,11 @@ def astype_nansafe( """ if arr.ndim > 1: # Make sure we are doing non-copy ravel and reshape. - flags = arr.flags - flat = arr.ravel("K") + flat = arr.ravel() result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna) - order: Literal["C", "F"] = "F" if flags.f_contiguous else "C" # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no # attribute "reshape" - return result.reshape(arr.shape, order=order) # type: ignore[union-attr] + return result.reshape(arr.shape) # type: ignore[union-attr] # We get here with 0-dim from sparse arr = np.atleast_1d(arr) From bb497c9b692902d00869a4cd30458b31307d192a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 9 Jul 2021 20:41:48 -0400 Subject: [PATCH 2/8] Add test --- pandas/tests/frame/methods/test_astype.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index f098582ca04c6..1983184cb5e13 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -670,6 +670,16 @@ def test_astype_bytes(self): result = DataFrame(["foo", "bar", "baz"]).astype(bytes) assert result.dtypes[0] == np.dtype("S3") + @pytest.mark.parametrize("step1,step2", [(6, 3), (6, 1), (1, 3)]) + def test_astype_noncontiguous(self, step1, step2): + # GH#42396 + data = np.arange(72).reshape(12, 6) + df = DataFrame(data) + + result = df.iloc[:step1, :step2].astype("int32").astype("int64") + expected = df.iloc[:step1, :step2] + tm.assert_frame_equal(result, expected) + class TestAstypeCategorical: def test_astype_from_categorical3(self): From 2b1c56180c26cb15ab57fa289cf3bfaef711ea9c Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 9 Jul 2021 20:58:30 -0400 Subject: [PATCH 3/8] PERF/REGR: astype changing order of some 2d data --- doc/source/whatsnew/v1.3.1.rst | 2 ++ pandas/tests/frame/methods/test_astype.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 255747c3c5c6d..13be6a0a45bab 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -18,6 +18,8 @@ Fixed regressions - :class:`DataFrame` constructed with with an older version of pandas could not be unpickled (:issue:`42345`) - Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42338`) - Fixed regression in :meth:`DataFrame.agg` dropping values when the DataFrame had an Extension Array dtype, a duplicate index, and ``axis=1`` (:issue:`42380`) +- Fixed regression in :meth:`DataFrame.astype` changing the order of noncontiguous data (:issue:`42396`) +- Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`) - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1983184cb5e13..70714eb62a8e2 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -670,10 +670,10 @@ def test_astype_bytes(self): result = DataFrame(["foo", "bar", "baz"]).astype(bytes) assert result.dtypes[0] == np.dtype("S3") - @pytest.mark.parametrize("step1,step2", [(6, 3), (6, 1), (1, 3)]) + @pytest.mark.parametrize("step1,step2", [(2, 2), (2, 1), (1, 2)]) def test_astype_noncontiguous(self, step1, step2): # GH#42396 - data = np.arange(72).reshape(12, 6) + data = np.arange(16).reshape(4, 4) df = DataFrame(data) result = df.iloc[:step1, :step2].astype("int32").astype("int64") From 2d0ef397002b067b161e89a46899eb3e3d62ce53 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 9 Jul 2021 21:51:14 -0400 Subject: [PATCH 4/8] Fix windows and 32 bit --- pandas/core/dtypes/cast.py | 2 +- pandas/tests/frame/methods/test_astype.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e588c75ff4c6d..71dda46ddbcd5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1093,7 +1093,7 @@ def astype_nansafe( The dtype was a datetime64/timedelta64 dtype, but it had no unit. """ if arr.ndim > 1: - # Make sure we are doing non-copy ravel and reshape. + # TODO: try to use contiguity to avoid potentially copying here, see #42475 flat = arr.ravel() result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna) # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 70714eb62a8e2..25314ba698d2f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -676,7 +676,7 @@ def test_astype_noncontiguous(self, step1, step2): data = np.arange(16).reshape(4, 4) df = DataFrame(data) - result = df.iloc[:step1, :step2].astype("int32").astype("int64") + result = df.iloc[:step1, :step2].astype("int16").astype(np.intp) expected = df.iloc[:step1, :step2] tm.assert_frame_equal(result, expected) From cd13406f3767562ec3e87e29c6b145f8fe89df7d Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Fri, 9 Jul 2021 23:36:25 -0400 Subject: [PATCH 5/8] Windows fix, test with stride also --- pandas/tests/frame/methods/test_astype.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 25314ba698d2f..f6b69114b280f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -674,12 +674,16 @@ def test_astype_bytes(self): def test_astype_noncontiguous(self, step1, step2): # GH#42396 data = np.arange(16).reshape(4, 4) - df = DataFrame(data) + df = DataFrame(data, dtype=np.intp) result = df.iloc[:step1, :step2].astype("int16").astype(np.intp) expected = df.iloc[:step1, :step2] tm.assert_frame_equal(result, expected) + result = df.iloc[::step1, ::step2].astype("int16").astype(np.intp) + expected = df.iloc[::step1, ::step2] + tm.assert_frame_equal(result, expected) + class TestAstypeCategorical: def test_astype_from_categorical3(self): From 996650e43fb7a6670c703f02b26c3d8384d7d1ea Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 10 Jul 2021 09:40:50 -0400 Subject: [PATCH 6/8] Refactor test --- pandas/tests/frame/methods/test_astype.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index f6b69114b280f..c3c908120a1c3 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -670,18 +670,24 @@ def test_astype_bytes(self): result = DataFrame(["foo", "bar", "baz"]).astype(bytes) assert result.dtypes[0] == np.dtype("S3") - @pytest.mark.parametrize("step1,step2", [(2, 2), (2, 1), (1, 2)]) - def test_astype_noncontiguous(self, step1, step2): + @pytest.mark.parametrize( + "index_slice", + [ + np.s_[:2, :2], + np.s_[:1, :2], + np.s_[:2, :1], + np.s_[::2, ::2], + np.s_[::1, ::2], + np.s_[::2, ::1], + ], + ) + def test_astype_noncontiguous(self, index_slice): # GH#42396 data = np.arange(16).reshape(4, 4) df = DataFrame(data, dtype=np.intp) - result = df.iloc[:step1, :step2].astype("int16").astype(np.intp) - expected = df.iloc[:step1, :step2] - tm.assert_frame_equal(result, expected) - - result = df.iloc[::step1, ::step2].astype("int16").astype(np.intp) - expected = df.iloc[::step1, ::step2] + result = df.iloc[index_slice].astype("int16").astype(np.intp) + expected = df.iloc[index_slice] tm.assert_frame_equal(result, expected) From 79f2d5012c5ac66b50d16b3f6085226c37cb0cf5 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 10 Jul 2021 09:48:35 -0400 Subject: [PATCH 7/8] Remove TODO and simplify test --- pandas/core/dtypes/cast.py | 1 - pandas/tests/frame/methods/test_astype.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 71dda46ddbcd5..65acc973c00f8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1093,7 +1093,6 @@ def astype_nansafe( The dtype was a datetime64/timedelta64 dtype, but it had no unit. """ if arr.ndim > 1: - # TODO: try to use contiguity to avoid potentially copying here, see #42475 flat = arr.ravel() result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna) # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index c3c908120a1c3..4ddd0940129ce 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -684,11 +684,11 @@ def test_astype_bytes(self): def test_astype_noncontiguous(self, index_slice): # GH#42396 data = np.arange(16).reshape(4, 4) - df = DataFrame(data, dtype=np.intp) + df = DataFrame(data) - result = df.iloc[index_slice].astype("int16").astype(np.intp) - expected = df.iloc[index_slice] - tm.assert_frame_equal(result, expected) + result = df.iloc[index_slice].astype("int16").to_numpy() + expected = data[index_slice].astype("int16") + tm.assert_numpy_array_equal(result, expected) class TestAstypeCategorical: From c6a533b0f3d28664de141b8b595d1ab0172a5ea7 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 10 Jul 2021 09:51:06 -0400 Subject: [PATCH 8/8] Simplify expected --- pandas/tests/frame/methods/test_astype.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 4ddd0940129ce..1f1991214aad0 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -686,9 +686,9 @@ def test_astype_noncontiguous(self, index_slice): data = np.arange(16).reshape(4, 4) df = DataFrame(data) - result = df.iloc[index_slice].astype("int16").to_numpy() - expected = data[index_slice].astype("int16") - tm.assert_numpy_array_equal(result, expected) + result = df.iloc[index_slice].astype("int16") + expected = df.iloc[index_slice] + tm.assert_frame_equal(result, expected, check_dtype=False) class TestAstypeCategorical: