BUG: 2D ndarray of dtype 'object' is always copied upon construction (pandas-dev#39263)

irgolic · irgolic · commit 9486226788a7 · 2021-01-19T16:29:33.000Z
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
@@ -49,6 +49,7 @@ Bug fixes
 - Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings. This resulted in a regression in some cases as the default for ``float_precision`` was changed in pandas 1.2.0 (:issue:`38753`)
 - Bug in :func:`read_csv` not closing an opened file handle when a ``csv.Error`` or ``UnicodeDecodeError`` occurred while initializing (:issue:`39024`)
 - Bug in :func:`pandas.testing.assert_index_equal` raising ``TypeError`` with ``check_order=False`` when :class:`Index` has mixed dtype (:issue:`39168`)
+- Bug in :func:`pandas.core.internals.construction.init_ndarray` unnecessarily copying all object arrays after datetime inference (:issue:`39263`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -3,6 +3,7 @@
 constructors before passing them to a BlockManager.
 """
 from collections import abc
+from itertools import groupby
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -240,18 +241,29 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
         if values.ndim == 2 and values.shape[0] != 1:
             # transpose and separate blocks
 
-            dvals_list = [maybe_infer_to_datetimelike(row) for row in values]
-            for n in range(len(dvals_list)):
-                if isinstance(dvals_list[n], np.ndarray):
-                    dvals_list[n] = dvals_list[n].reshape(1, -1)
+            dvals_list = (maybe_infer_to_datetimelike(row) for row in values)
 
             from pandas.core.internals.blocks import make_block
 
-            # TODO: What about re-joining object columns?
-            block_values = [
-                make_block(dvals_list[n], placement=[n], ndim=2)
-                for n in range(len(dvals_list))
-            ]
+            i = 0
+            block_values = []
+            for is_object, group in groupby(
+                dvals_list, lambda row: is_object_dtype(row.dtype)
+            ):
+                dval_group = list(group)
+                ei = i + len(dval_group)
+                if is_object:
+                    block_values.append(values[i:ei])
+                else:
+                    block_values.extend(
+                        make_block(
+                            row.reshape(1, -1) if isinstance(row, np.ndarray) else row,
+                            placement=[i + incr],
+                            ndim=2,
+                        )
+                        for incr, row in enumerate(dval_group)
+                    )
+                i = ei
 
         else:
             datelike_vals = maybe_infer_to_datetimelike(values)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -2267,6 +2267,14 @@ def test_nested_dict_construction(self):
         )
         tm.assert_frame_equal(result, expected)
 
+    def test_object_array_does_not_copy(self):
+        a = np.array(["a", "b"], dtype="object")
+        b = np.array([["a", "b"], ["c", "d"]], dtype="object")
+        df = DataFrame(a)
+        assert np.shares_memory(df.values, a)
+        df2 = DataFrame(b)
+        assert np.shares_memory(df2.values, b)
+
     def test_from_tzaware_object_array(self):
         # GH#26825 2D object array of tzaware timestamps should not raise
         dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")