From db1020cb4281a00893163706b41292f02290698c Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Feb 2022 20:18:05 -0800 Subject: [PATCH 1/3] REF: avoid ravel/reshape in astype_nansafe, ndarray_to_mgr --- pandas/core/dtypes/astype.py | 15 +++++++-------- pandas/core/internals/construction.py | 12 ++++++------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 0e7bb1ed293d8..1e78bf0cd33ae 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -83,12 +83,6 @@ def astype_nansafe( ValueError The dtype was a datetime64/timedelta64 dtype, but it had no unit. """ - if arr.ndim > 1: - flat = arr.ravel() - result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna) - # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no - # attribute "reshape" - return result.reshape(arr.shape) # type: ignore[union-attr] # We get here with 0-dim from sparse arr = np.atleast_1d(arr) @@ -109,7 +103,12 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) if issubclass(dtype.type, str): - return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False) + shape = arr.shape + if arr.ndim > 1: + arr = arr.ravel() + return lib.ensure_string_array( + arr, skipna=skipna, convert_na_value=False + ).reshape(shape) elif is_datetime64_dtype(arr.dtype): if dtype == np.int64: @@ -146,7 +145,7 @@ def astype_nansafe( from pandas import to_datetime return astype_nansafe( - to_datetime(arr).values, + to_datetime(arr.ravel()).values.reshape(arr.shape), dtype, copy=copy, ) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 03016371dd553..8451dcb6e412a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -329,18 +329,18 @@ def ndarray_to_mgr( values = _prep_ndarray(values, copy=copy_on_sanitize) if dtype is not None and not is_dtype_equal(values.dtype, dtype): - shape = values.shape - 
flat = values.ravel() - # GH#40110 see similar check inside sanitize_array rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") values = sanitize_array( - flat, None, dtype=dtype, copy=copy_on_sanitize, raise_cast_failure=rcf + values, + None, + dtype=dtype, + copy=copy_on_sanitize, + raise_cast_failure=rcf, + allow_2d=True, ) - values = values.reshape(shape) - # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes( values.shape[0], values.shape[1], index=index, columns=columns From 597cfb8975bcfb63593e1f3ee1fad9817af02bc6 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 12:30:43 -0800 Subject: [PATCH 2/3] fix calling in ensure_string_array --- pandas/core/construction.py | 10 +++++++++- pandas/tests/frame/test_constructors.py | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 10637db555dd8..0e1e2eae27c7e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -786,7 +786,15 @@ def _try_cast( elif dtype.kind == "U": # TODO: test cases with arr.dtype.kind in ["m", "M"] - return lib.ensure_string_array(arr, convert_na_value=False, copy=copy) + if is_ndarray: + shape = arr.shape + if arr.ndim > 1: + arr = arr.ravel() + else: + shape = (len(arr),) + return lib.ensure_string_array(arr, convert_na_value=False, copy=copy).reshape( + shape + ) elif dtype.kind in ["m", "M"]: return maybe_cast_to_datetime(arr, dtype) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 68572941b19c0..9675f16ff61ad 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -70,6 +70,14 @@ class TestDataFrameConstructors: + def test_constructor_from_ndarray_with_str_dtype(self): + # If we don't ravel/reshape around ensure_string_array, we end up + # with an array of strings each of which is e.g. 
"[0 1 2]" + arr = np.arange(12).reshape(4, 3) + df = DataFrame(arr, dtype=str) + expected = DataFrame(arr.astype(str)) + tm.assert_frame_equal(df, expected) + def test_constructor_from_2d_datetimearray(self, using_array_manager): dti = date_range("2016-01-01", periods=6, tz="US/Pacific") dta = dti._data.reshape(3, 2) From 39b0f77fde48bb0064c78d06d14b7ac460226268 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 19:47:15 -0800 Subject: [PATCH 3/3] mypy fixup --- pandas/core/construction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 0e1e2eae27c7e..17cdf6665aa99 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -787,6 +787,7 @@ def _try_cast( elif dtype.kind == "U": # TODO: test cases with arr.dtype.kind in ["m", "M"] if is_ndarray: + arr = cast(np.ndarray, arr) shape = arr.shape if arr.ndim > 1: arr = arr.ravel()