From 00df6516fb366f87e076d7e632a347136710ecd8 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Dec 2021 15:27:09 -0800 Subject: [PATCH 1/5] PERF: DataFrame(pytorch_tensor) --- asv_bench/benchmarks/frame_ctor.py | 17 +++++++++++++++++ pandas/core/frame.py | 9 +++++++-- pandas/tests/test_downstream.py | 13 ++++++++++++- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 912971257490c..354dc5a8cea04 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -182,4 +182,21 @@ def time_frame_from_arrays_sparse(self): ) +class From3rdParty: + # GH#44616 + + def setup_cache(self): + try: + import torch + except ImportError: + raise NotImplementedError + + row = 700000 + col = 64 + self.val_tensor = torch.randn(row, col) + + def time_from_torch(self): + DataFrame(self.val_tensor) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3cd787748738e..f06d64344d627 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -705,11 +705,16 @@ def __init__( # For data is list-like, or Iterable (will consume into list) elif is_list_like(data): if not isinstance(data, (abc.Sequence, ExtensionArray)): - data = list(data) + if hasattr(data, "__array__"): + # GH#44616 big perf improvement for e.g. 
pytorch tensor + data = np.asarray(data) + else: + data = list(data) if len(data) > 0: if is_dataclass(data[0]): data = dataclasses_to_dicts(data) - if treat_as_nested(data): + if not isinstance(data, np.ndarray) and treat_as_nested(data): + # exclude ndarray as we may have cast it a few lines above if columns is not None: # error: Argument 1 to "ensure_index" has incompatible type # "Collection[Any]"; expected "Union[Union[Union[ExtensionArray, diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 1972fbbe0f414..624ff2df64181 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -5,7 +5,7 @@ import subprocess import sys -import numpy as np # noqa:F401 needed in namespace for statsmodels +import numpy as np import pytest import pandas.util._test_decorators as td @@ -176,6 +176,17 @@ def test_pyarrow(df): tm.assert_frame_equal(result, df) +def test_torch_frame_construction(using_array_manager): + # GH#44616 + torch = import_module("torch") + val_tensor = torch.randn(700, 64) + + df = DataFrame(val_tensor) + + if not using_array_manager: + assert np.shares_memory(df, val_tensor) + + def test_yaml_dump(df): # GH#42748 yaml = import_module("yaml") From d8bb60e9c3b26a5fdfea18c8f2407dc2b505bb5f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 21 Dec 2021 15:28:09 -0800 Subject: [PATCH 2/5] extend test for Series --- pandas/tests/test_downstream.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 624ff2df64181..3880b9ecd9da7 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -186,6 +186,9 @@ def test_torch_frame_construction(using_array_manager): if not using_array_manager: assert np.shares_memory(df, val_tensor) + ser = pd.Series(val_tensor[0]) + assert np.shares_memory(ser, val_tensor) + def test_yaml_dump(df): # GH#42748 From 559204a4c4ea7800fbae6dce72931c3327980261 Mon Sep 17 00:00:00 2001 From: 
Brock Date: Tue, 21 Dec 2021 18:25:15 -0800 Subject: [PATCH 3/5] CI: add pytorch to a build --- ci/deps/actions-38-db.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/deps/actions-38-db.yaml b/ci/deps/actions-38-db.yaml index c08c642049b41..0157961a151e7 100644 --- a/ci/deps/actions-38-db.yaml +++ b/ci/deps/actions-38-db.yaml @@ -33,6 +33,7 @@ dependencies: - pyarrow>=1.0.1 - pymysql - pytables + - pytorch - python-snappy - python-dateutil - pytz From 569d19a90e85d5c32531c419de53711f8bcffca3 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 22 Dec 2021 16:37:01 -0800 Subject: [PATCH 4/5] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index cdc0bbb1dfd6a..85442a876b988 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -601,6 +601,7 @@ Performance improvements - Performance improvement in :func:`to_csv` when :class:`MultiIndex` contains a lot of unused levels (:issue:`37484`) - Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`) - Performance improvement in :func:`concat` (:issue:`43354`) +- Performance improvement in constructing a :class:`DataFrame` from array-like objects like a ``PyTorch`` tensor (:issue:`44616`) - .. 
--------------------------------------------------------------------------- From 58222615c95a45cc89b0bcf0bdfa3cd11ea1644f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 22 Dec 2021 16:38:08 -0800 Subject: [PATCH 5/5] setup_cache->setup --- asv_bench/benchmarks/frame_ctor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 354dc5a8cea04..eace665ba0bac 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -185,7 +185,7 @@ def time_frame_from_arrays_sparse(self): class From3rdParty: # GH#44616 - def setup_cache(self): + def setup(self): try: import torch except ImportError: