From 7a8dcf0d5680745ef55e01a9446506349605546e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 31 Oct 2023 11:36:11 +0000 Subject: [PATCH 01/10] wip: add notes on execution model --- .../dataframe_api/column_object.py | 5 + .../dataframe_api/dataframe_object.py | 23 +++++ spec/design_topics/execution_model.md | 97 +++++++++++++++++++ spec/design_topics/index.rst | 1 + 4 files changed, 126 insertions(+) create mode 100644 spec/design_topics/execution_model.md diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 0bccf2d0..16e6336d 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -802,6 +802,11 @@ def to_array(self) -> Any: may choose to return a numpy array (for numpy prior to 2.0), with the understanding that consuming libraries would then use the ``array-api-compat`` package to convert it to a Standard-compliant array. + + Notes + ----- + To be guaranteed to run across all implementations, :meth:`may_execute` should + be executed at some point before calling this method. """ ... diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index f5c17d80..ae4b36ad 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -64,6 +64,11 @@ def dataframe(self) -> SupportsDataFrameAPI: def shape(self) -> tuple[int, int]: """ Return number of rows and number of columns. + + Notes + ----- + To be guaranteed to run across all implementations, :meth:`may_execute` should + be executed at some point before calling this method. """ ... @@ -928,6 +933,9 @@ def to_array(self, dtype: DType) -> Any: may choose to return a numpy array (for numpy prior to 2.0), with the understanding that consuming libraries would then use the ``array-api-compat`` package to convert it to a Standard-compliant array. + + To be guaranteed to run across all implementations, :meth:`may_execute` should + be executed at some point before calling this method. """ def join( @@ -972,3 +980,18 @@ def join( present in both `self` and `other`. """ ... + + def may_execute(self) -> Self: + """ + Hint that execution may be triggered, depending on the implementation. + + This is intended as a hint, rather than as a directive. Implementations + which do not separate lazy vs eager execution may ignore this method and + treat it as a no-op. Likewise for implementations which support automated + execution. + + .. note:: + This method may force execution. If necessary, it should be called + at most once per dataframe, and as late as possible in the pipeline. + """ + ... diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md new file mode 100644 index 00000000..103a794c --- /dev/null +++ b/spec/design_topics/execution_model.md @@ -0,0 +1,97 @@ +# Execution model + +The vast majority of the Dataframe API is designed to be agnostic of the +underlying execution model. + +However, there are some methods which, depending on the implementation, may +not be supported in some cases. + +For example, let's consider the following: +```python +df: DataFrame +features = [] +for column_name in df.column_names: + if df.col(column_name).std() > 0: + features.append(column_name) +return features +``` +If `df` is a lazy dataframe, then the call `df.col(column_name).std() > 0` returns +a (ducktyped) Python boolean scalar. No issues so far. Problem is, +what happens when `if df.col(column_name).std() > 0` is called? + +Under the hood, Python will call `(df.col(column_name).std() > 0).__bool__()` in +order to extract a Python boolean. This is a problem for "lazy" implementations, +as the laziness needs breaking in order to evaluate the above. + +Dask and Polars both require that `.compute` (resp. `.collect`) be called beforehand +for such an operation to be executed: + ```python + In [1]: import dask.dataframe as dd + + In [2]: pandas_df = pd.DataFrame({"x": [1, 2, 3], "y": 1}) + + In [3]: df = dd.from_pandas(pandas_df, npartitions=2) + + In [4]: scalar = df.x.std() > 0 + + In [5]: if scalar: + ...: print('scalar is positive') + ...: + --------------------------------------------------------------------------- + TypeError Traceback (most recent call last) + Cell In[5], line 1 + ----> 1 if scalar: + 2 print('scalar is positive') + + File ~/tmp/.venv/lib/python3.10/site-packages/dask/dataframe/core.py:312, in Scalar.__bool__(self) + 311 def __bool__(self): + --> 312 raise TypeError( + 313 f"Trying to convert {self} to a boolean value. Because Dask objects are " + 314 "lazily evaluated, they cannot be converted to a boolean value or used " + 315 "in boolean conditions like if statements. Try calling .compute() to " + 316 "force computation prior to converting to a boolean value or using in " + 317 "a conditional statement." + 318 ) + + TypeError: Trying to convert dd.Scalar to a boolean value. Because Dask objects are lazily evaluated, they cannot be converted to a boolean value or used in boolean conditions like if statements. Try calling .compute() to force computation prior to converting to a boolean value or using in a conditional statement. + ``` + +Exactly which methods require computation may vary across implementations. Some may +implicitly do it for users under-the-hood for certain methods, whereas others require +the user to explicitly trigger it. + +Therefore, the Dataframe API has a `Dataframe.maybe_evaluate` method. This is to be +interpreted as a hint, rather than as a directive - the implementation itself may decide +whether to force execution at this step, or whether to defer it to later. + +Operations which require `DataFrame.may_execute` to have been called at some prior +point are: +- `DataFrame.to_array` +- `DataFrame.shape` +- `Column.to_array` +- calling `bool`, `int`, or `float` on a scalar + +Therefore, the Standard-compliant way to write the code above is: +```python +df: DataFrame +df = df.may_execute() +features = [] +for column_name in df.column_names: + if df.col(column_name).std() > 0: + features.append(column_name) +return features +``` + +Note now `DataFrame.may_execute` is called only once, and as late as possible. +Conversely, the "wrong" way to execute the above would be: + +```python +df: DataFrame +features = [] +for column_name in df.column_names: + # Do NOT do this! + if df.may_execute().col(column_name).std() > 0: + features.append(column_name) +return features +``` +as that will potentially re-trigger the same execution multiple times. diff --git a/spec/design_topics/index.rst b/spec/design_topics/index.rst index 023da1a9..9f11ca89 100644 --- a/spec/design_topics/index.rst +++ b/spec/design_topics/index.rst @@ -8,3 +8,4 @@ Design topics & constraints backwards_compatibility data_interchange python_builtin_types + execution_model From 0f4188bdf7f08fd16f8ddd1cdab918408f3e6ee3 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 31 Oct 2023 15:58:49 +0000 Subject: [PATCH 02/10] reword --- .../dataframe_api/dataframe_object.py | 2 +- spec/design_topics/execution_model.md | 47 +++++++------------ 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index ae4b36ad..93200735 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -981,7 +981,7 @@ def join( """ ... - def may_execute(self) -> Self: + def maybe_execute(self) -> Self: """ Hint that execution may be triggered, depending on the implementation. diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md index 103a794c..afaea402 100644 --- a/spec/design_topics/execution_model.md +++ b/spec/design_topics/execution_model.md @@ -38,40 +38,13 @@ for such an operation to be executed: ...: print('scalar is positive') ...: --------------------------------------------------------------------------- - TypeError Traceback (most recent call last) - Cell In[5], line 1 - ----> 1 if scalar: - 2 print('scalar is positive') - - File ~/tmp/.venv/lib/python3.10/site-packages/dask/dataframe/core.py:312, in Scalar.__bool__(self) - 311 def __bool__(self): - --> 312 raise TypeError( - 313 f"Trying to convert {self} to a boolean value. Because Dask objects are " - 314 "lazily evaluated, they cannot be converted to a boolean value or used " - 315 "in boolean conditions like if statements. Try calling .compute() to " - 316 "force computation prior to converting to a boolean value or using in " - 317 "a conditional statement." - 318 ) + [...] TypeError: Trying to convert dd.Scalar to a boolean value. Because Dask objects are lazily evaluated, they cannot be converted to a boolean value or used in boolean conditions like if statements. Try calling .compute() to force computation prior to converting to a boolean value or using in a conditional statement. ``` -Exactly which methods require computation may vary across implementations. Some may -implicitly do it for users under-the-hood for certain methods, whereas others require -the user to explicitly trigger it. - -Therefore, the Dataframe API has a `Dataframe.maybe_evaluate` method. This is to be -interpreted as a hint, rather than as a directive - the implementation itself may decide -whether to force execution at this step, or whether to defer it to later. - -Operations which require `DataFrame.may_execute` to have been called at some prior -point are: -- `DataFrame.to_array` -- `DataFrame.shape` -- `Column.to_array` -- calling `bool`, `int`, or `float` on a scalar - -Therefore, the Standard-compliant way to write the code above is: +The Dataframe API has a `DataFrame.maybe_evaluate` for addressing the above. We can use it to rewrite the code above +as follows: ```python df: DataFrame df = df.may_execute() @@ -82,6 +55,20 @@ for column_name in df.column_names: return features ``` +Note that `maybe_evaluate` is to be interpreted as a hint, rather than as a directive - +the implementation itself may decide +whether to force execution at this step, or whether to defer it to later. +For example, a dataframe which can convert to a lazy array could decide to ignore +`maybe_evaluate` when evaluting `DataFrame.to_array` but to respect it when evaluating +`float(Column.std())`. + +Operations which require `DataFrame.may_execute` to have been called at some prior +point are: +- `DataFrame.to_array` +- `DataFrame.shape` +- `Column.to_array` +- calling `bool`, `int`, or `float` on a scalar + Note now `DataFrame.may_execute` is called only once, and as late as possible. Conversely, the "wrong" way to execute the above would be: From 1dd4678b11716c2669a3d3a08338a9e480c933d7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 31 Oct 2023 16:01:25 +0000 Subject: [PATCH 03/10] remove column mentions for now --- spec/API_specification/dataframe_api/column_object.py | 5 ----- spec/API_specification/dataframe_api/dataframe_object.py | 4 ++-- spec/design_topics/execution_model.md | 9 ++++----- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 16e6336d..0bccf2d0 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -802,11 +802,6 @@ def to_array(self) -> Any: may choose to return a numpy array (for numpy prior to 2.0), with the understanding that consuming libraries would then use the ``array-api-compat`` package to convert it to a Standard-compliant array. - - Notes - ----- - To be guaranteed to run across all implementations, :meth:`may_execute` should - be executed at some point before calling this method. """ ... diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 93200735..478d7b65 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -67,7 +67,7 @@ def shape(self) -> tuple[int, int]: Notes ----- - To be guaranteed to run across all implementations, :meth:`may_execute` should + To be guaranteed to run across all implementations, :meth:`maybe_execute` should be executed at some point before calling this method. """ ... @@ -934,7 +934,7 @@ def to_array(self, dtype: DType) -> Any: understanding that consuming libraries would then use the ``array-api-compat`` package to convert it to a Standard-compliant array. - To be guaranteed to run across all implementations, :meth:`may_execute` should + To be guaranteed to run across all implementations, :meth:`maybe_execute` should be executed at some point before calling this method. """ diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md index afaea402..1650de33 100644 --- a/spec/design_topics/execution_model.md +++ b/spec/design_topics/execution_model.md @@ -47,7 +47,7 @@ The Dataframe API has a `DataFrame.maybe_evaluate` for addressing the above. We as follows: ```python df: DataFrame -df = df.may_execute() +df = df.maybe_execute() features = [] for column_name in df.column_names: if df.col(column_name).std() > 0: @@ -62,14 +62,13 @@ For example, a dataframe which can convert to a lazy array could decide to ignor `maybe_evaluate` when evaluting `DataFrame.to_array` but to respect it when evaluating `float(Column.std())`. -Operations which require `DataFrame.may_execute` to have been called at some prior +Operations which require `DataFrame.maybe_execute` to have been called at some prior point are: - `DataFrame.to_array` - `DataFrame.shape` -- `Column.to_array` - calling `bool`, `int`, or `float` on a scalar -Note now `DataFrame.may_execute` is called only once, and as late as possible. +Note now `DataFrame.maybe_execute` is called only once, and as late as possible. Conversely, the "wrong" way to execute the above would be: ```python @@ -77,7 +76,7 @@ df: DataFrame features = [] for column_name in df.column_names: # Do NOT do this! - if df.may_execute().col(column_name).std() > 0: + if df.maybe_execute().col(column_name).std() > 0: features.append(column_name) return features ``` From e4f47c7abec79f3cc02eb485bc3a270e75a92871 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 7 Nov 2023 11:39:34 +0000 Subject: [PATCH 04/10] remove to_array --- .../dataframe_api/dataframe_object.py | 7 ++----- spec/design_topics/execution_model.md | 17 +++++++---------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 478d7b65..0a483fce 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -67,7 +67,7 @@ def shape(self) -> tuple[int, int]: Notes ----- - To be guaranteed to run across all implementations, :meth:`maybe_execute` should + To be guaranteed to run across all implementations, :meth:`may_execute` should be executed at some point before calling this method. """ ... @@ -933,9 +933,6 @@ def to_array(self, dtype: DType) -> Any: may choose to return a numpy array (for numpy prior to 2.0), with the understanding that consuming libraries would then use the ``array-api-compat`` package to convert it to a Standard-compliant array. - - To be guaranteed to run across all implementations, :meth:`maybe_execute` should - be executed at some point before calling this method. """ def join( @@ -981,7 +978,7 @@ def join( """ ... - def maybe_execute(self) -> Self: + def may_execute(self) -> Self: """ Hint that execution may be triggered, depending on the implementation. diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md index 1650de33..07d76432 100644 --- a/spec/design_topics/execution_model.md +++ b/spec/design_topics/execution_model.md @@ -43,11 +43,11 @@ for such an operation to be executed: TypeError: Trying to convert dd.Scalar to a boolean value. Because Dask objects are lazily evaluated, they cannot be converted to a boolean value or used in boolean conditions like if statements. Try calling .compute() to force computation prior to converting to a boolean value or using in a conditional statement. ``` -The Dataframe API has a `DataFrame.maybe_evaluate` for addressing the above. We can use it to rewrite the code above +The Dataframe API has a `DataFrame.may_execute` for addressing the above. We can use it to rewrite the code above as follows: ```python df: DataFrame -df = df.maybe_execute() +df = df.may_execute() features = [] for column_name in df.column_names: if df.col(column_name).std() > 0: @@ -55,20 +55,17 @@ for column_name in df.column_names: return features ``` -Note that `maybe_evaluate` is to be interpreted as a hint, rather than as a directive - +Note that `may_execute` is to be interpreted as a hint, rather than as a directive - the implementation itself may decide whether to force execution at this step, or whether to defer it to later. -For example, a dataframe which can convert to a lazy array could decide to ignore -`maybe_evaluate` when evaluting `DataFrame.to_array` but to respect it when evaluating -`float(Column.std())`. -Operations which require `DataFrame.maybe_execute` to have been called at some prior +Operations which require `DataFrame.may_execute` to have been called at some prior point are: -- `DataFrame.to_array` - `DataFrame.shape` - calling `bool`, `int`, or `float` on a scalar -Note now `DataFrame.maybe_execute` is called only once, and as late as possible. +Note how in the above example, `DataFrame.may_execute` was called only once, +and as late as possible. Conversely, the "wrong" way to execute the above would be: ```python @@ -76,7 +73,7 @@ df: DataFrame features = [] for column_name in df.column_names: # Do NOT do this! - if df.maybe_execute().col(column_name).std() > 0: + if df.may_execute().col(column_name).std() > 0: features.append(column_name) return features ``` From b6b648b0f504e2fb05168a97baa5ce72d783f5b3 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:47:51 +0000 Subject: [PATCH 05/10] use persist instead --- .../dataframe_api/dataframe_object.py | 4 +-- spec/design_topics/execution_model.md | 32 +++++++++++++++---- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 0a483fce..bcd9c411 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -67,7 +67,7 @@ def shape(self) -> tuple[int, int]: Notes ----- - To be guaranteed to run across all implementations, :meth:`may_execute` should + To be guaranteed to run across all implementations, :meth:`persist` should be executed at some point before calling this method. """ ... @@ -978,7 +978,7 @@ def join( """ ... - def may_execute(self) -> Self: + def persist(self) -> Self: """ Hint that execution may be triggered, depending on the implementation. diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md index 07d76432..b569c751 100644 --- a/spec/design_topics/execution_model.md +++ b/spec/design_topics/execution_model.md @@ -1,5 +1,7 @@ # Execution model +## Scope + The vast majority of the Dataframe API is designed to be agnostic of the underlying execution model. @@ -43,11 +45,13 @@ for such an operation to be executed: TypeError: Trying to convert dd.Scalar to a boolean value. Because Dask objects are lazily evaluated, they cannot be converted to a boolean value or used in boolean conditions like if statements. Try calling .compute() to force computation prior to converting to a boolean value or using in a conditional statement. ``` -The Dataframe API has a `DataFrame.may_execute` for addressing the above. We can use it to rewrite the code above +## Solution: DataFrame.persist + +The Dataframe API has a `DataFrame.persist` for addressing the above. We can use it to rewrite the code above as follows: ```python df: DataFrame -df = df.may_execute() +df = df.persist() features = [] for column_name in df.column_names: if df.col(column_name).std() > 0: @@ -55,16 +59,17 @@ for column_name in df.column_names: return features ``` -Note that `may_execute` is to be interpreted as a hint, rather than as a directive - +Note that `persist` is to be interpreted as a hint, rather than as a directive - the implementation itself may decide -whether to force execution at this step, or whether to defer it to later. +whether to force execution at this step, cache results, defer it to later and cache +results when they reach the current node, or ignore it. -Operations which require `DataFrame.may_execute` to have been called at some prior +Operations which require `DataFrame.persist` to have been called at some prior point are: - `DataFrame.shape` - calling `bool`, `int`, or `float` on a scalar -Note how in the above example, `DataFrame.may_execute` was called only once, +Note how in the above example, `DataFrame.persist` was called only once, and as late as possible. Conversely, the "wrong" way to execute the above would be: @@ -73,8 +78,21 @@ df: DataFrame features = [] for column_name in df.column_names: # Do NOT do this! - if df.may_execute().col(column_name).std() > 0: + if df.persist().col(column_name).std() > 0: features.append(column_name) return features ``` as that will potentially re-trigger the same execution multiple times. + +## Propagation + +Propagation of "persistedness" is outside the scope of the Standard. Nonetheless, +some implementations may choose to disallow patterns such as +```python +df_raw: SupportDataFrameAPI +df = df_raw.__dataframe_consortium_standard__(api_version='2023.09-beta') +if df.col('a').mean() > 0: # raises, call `.persist` beforehand + do_something() +``` +and instead "force" users to call `.persist` at some point between `__dataframe_consortium_standard__` +and an operation which requires execution. From 6d5a59989f6b432c1765f443fb3b5899c81ff3c7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 17:08:00 +0000 Subject: [PATCH 06/10] remove note on propagation --- .../dataframe_api/dataframe_object.py | 29 ++++++++++++++++--- spec/design_topics/execution_model.md | 13 --------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index bcd9c411..297eb136 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -980,15 +980,36 @@ def join( def persist(self) -> Self: """ - Hint that execution may be triggered, depending on the implementation. + Hint to that computation prior to this point should not be repeated. This is intended as a hint, rather than as a directive. Implementations which do not separate lazy vs eager execution may ignore this method and - treat it as a no-op. Likewise for implementations which support automated - execution. + treat it as a no-op. .. note:: - This method may force execution. If necessary, it should be called + This method may trigger execution. If necessary, it should be called at most once per dataframe, and as late as possible in the pipeline. + + For example, do this + + .. code-block:: python + + df: DataFrame + df = df.persist() + features = [] + for column_name in df.column_names: + if df.col(column_name).std() > 0: + features.append(column_name) + + instead of this: + + .. code-block:: python + + df: DataFrame + features = [] + for column_name in df.column_names: + # Do NOT do this! + if df.persist().col(column_name).std() > 0: + features.append(column_name) """ ... diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md index b569c751..24c8b9f4 100644 --- a/spec/design_topics/execution_model.md +++ b/spec/design_topics/execution_model.md @@ -83,16 +83,3 @@ for column_name in df.column_names: return features ``` as that will potentially re-trigger the same execution multiple times. - -## Propagation - -Propagation of "persistedness" is outside the scope of the Standard. Nonetheless, -some implementations may choose to disallow patterns such as -```python -df_raw: SupportDataFrameAPI -df = df_raw.__dataframe_consortium_standard__(api_version='2023.09-beta') -if df.col('a').mean() > 0: # raises, call `.persist` beforehand - do_something() -``` -and instead "force" users to call `.persist` at some point between `__dataframe_consortium_standard__` -and an operation which requires execution. From 6cef5694f6de3defae062f53d6fea3458533c785 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 17:11:24 +0000 Subject: [PATCH 07/10] update purpose and scope --- spec/purpose_and_scope.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spec/purpose_and_scope.md b/spec/purpose_and_scope.md index dfc6d138..8ee830a4 100644 --- a/spec/purpose_and_scope.md +++ b/spec/purpose_and_scope.md @@ -125,9 +125,10 @@ See the [use cases](use_cases.md) section for details on the exact use cases con Implementation details of the dataframes and execution of operations. This includes: - How data is represented and stored (whether the data is in memory, disk, distributed) -- Expectations on when the execution is happening (in an eager or lazy way) +- Expectations on when the execution is happening (in an eager or lazy way), other than `DataFrame.persist` - Other execution details + **Rationale:** The API defined in this document needs to be used by libraries as diverse as Ibis, Dask, Vaex or cuDF. The data can live in databases, distributed systems, disk or GPU memory. Any decision that involves assumptions on where the data is stored, or where execution happens From 4bf81c2959617361dc3272a3cd3243cd95008920 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 18:24:30 +0000 Subject: [PATCH 08/10] reduce execution_model --- .../dataframe_api/dataframe_object.py | 9 ++--- spec/design_topics/execution_model.md | 40 +------------------ 2 files changed, 5 insertions(+), 44 deletions(-) diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index fb162fe6..6b2116d2 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -62,9 +62,7 @@ def dataframe(self) -> SupportsDataFrameAPI: ... def shape(self) -> tuple[int, int]: - """ - Return number of rows and number of columns. - """ + """Return number of rows and number of columns.""" ... def group_by(self, *keys: str) -> GroupBy: @@ -931,10 +929,9 @@ def join( present in both `self` and `other`. """ ... - + def persist(self) -> Self: - """ - Hint to that computation prior to this point should not be repeated. + """Hint to that computation prior to this point should not be repeated. This is intended as a hint, rather than as a directive. Implementations which do not separate lazy vs eager execution may ignore this method and diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md index 24c8b9f4..c81c7767 100644 --- a/spec/design_topics/execution_model.md +++ b/spec/design_topics/execution_model.md @@ -45,41 +45,5 @@ for such an operation to be executed: TypeError: Trying to convert dd.Scalar to a boolean value. Because Dask objects are lazily evaluated, they cannot be converted to a boolean value or used in boolean conditions like if statements. Try calling .compute() to force computation prior to converting to a boolean value or using in a conditional statement. ``` -## Solution: DataFrame.persist - -The Dataframe API has a `DataFrame.persist` for addressing the above. We can use it to rewrite the code above -as follows: -```python -df: DataFrame -df = df.persist() -features = [] -for column_name in df.column_names: - if df.col(column_name).std() > 0: - features.append(column_name) -return features -``` - -Note that `persist` is to be interpreted as a hint, rather than as a directive - -the implementation itself may decide -whether to force execution at this step, cache results, defer it to later and cache -results when they reach the current node, or ignore it. - -Operations which require `DataFrame.persist` to have been called at some prior -point are: -- `DataFrame.shape` -- calling `bool`, `int`, or `float` on a scalar - -Note how in the above example, `DataFrame.persist` was called only once, -and as late as possible. -Conversely, the "wrong" way to execute the above would be: - -```python -df: DataFrame -features = [] -for column_name in df.column_names: - # Do NOT do this! - if df.persist().col(column_name).std() > 0: - features.append(column_name) -return features -``` -as that will potentially re-trigger the same execution multiple times. +Whether such computation succeeds or raises is currently not defined by the Standard and may vary across +implementations. From 305a44b038a5293f2591035c37606fe174a557b8 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 10 Nov 2023 16:52:25 +0000 Subject: [PATCH 09/10] Update spec/API_specification/dataframe_api/dataframe_object.py Co-authored-by: Ralf Gommers --- spec/API_specification/dataframe_api/dataframe_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 6b2116d2..142090e8 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -931,7 +931,7 @@ def join( ... def persist(self) -> Self: - """Hint to that computation prior to this point should not be repeated. + """Hint that computation prior to this point should not be repeated. This is intended as a hint, rather than as a directive. Implementations which do not separate lazy vs eager execution may ignore this method and From e0b7458339b5aa2330fa58838cea885ad952a038 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 10 Nov 2023 16:57:08 +0000 Subject: [PATCH 10/10] Update spec/purpose_and_scope.md --- spec/purpose_and_scope.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/purpose_and_scope.md b/spec/purpose_and_scope.md index 8ee830a4..f09a2e25 100644 --- a/spec/purpose_and_scope.md +++ b/spec/purpose_and_scope.md @@ -125,7 +125,7 @@ See the [use cases](use_cases.md) section for details on the exact use cases con Implementation details of the dataframes and execution of operations. This includes: - How data is represented and stored (whether the data is in memory, disk, distributed) -- Expectations on when the execution is happening (in an eager or lazy way), other than `DataFrame.persist` +- Expectations on when the execution is happening (in an eager or lazy way) (see `execution model` for some caveats) - Other execution details