
Commit 19179ea

added job wait and log parsing
1 parent cf4a305 commit 19179ea

4 files changed, +91 -7 lines changed


docs/modules/getting_started/examples/code/getting_started.sh

+15-1
@@ -42,9 +42,23 @@ exit 1
   ;;
 esac
 
-echo "Creating a Spark Application"
+echo "Creating a Spark Application..."
 # tag::install-sparkapp[]
 kubectl apply -f pyspark-pi.yaml
 # end::install-sparkapp[]
 
+echo "Waiting for job to complete ..."
+# tag::wait-for-job[]
+kubectl wait pods -l 'job-name=pyspark-pi' \
+  --for jsonpath='{.status.phase}'=Succeeded \
+  --timeout 300s
+# end::wait-for-job[]
 
+result=$(kubectl logs -l 'spark-role=driver' --tail=-1 | grep "Pi is roughly")
+
+if [ "$result" == "" ]; then
+  echo "Log result was not found!"
+  exit 1
+else
+  echo "Job result:" $result
+fi
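The new `wait-for-job` snippet blocks on the pods created by the job. As a complementary sketch (not part of this commit), kubectl can also wait on the Job resource itself; this assumes the operator creates a Kubernetes Job named `pyspark-pi`, which the `job-name=pyspark-pi` pod label used above suggests.

# Alternative sketch: wait for the Job's "complete" condition instead of
# polling pod phase. Assumes a Job named "pyspark-pi" exists.
kubectl wait job/pyspark-pi \
  --for condition=complete \
  --timeout 300s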
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# This script contains all the code snippets from the guide, as well as some assert tests
+# to test if the instructions in the guide work. The user *could* use it, but it is intended
+# for testing only.
+# The script will install the operators, create a Spark application, wait for the job to
+# complete and check the driver log for the expected result.
+# No long-running processes are left behind when the script exits.
+
+if [ $# -eq 0 ]
+then
+  echo "Installation method argument ('helm' or 'stackablectl') required."
+  exit 1
+fi
+
+case "$1" in
+"helm")
+  echo "Adding 'stackable-dev' Helm Chart repository"
+  # tag::helm-add-repo[]
+  helm repo add stackable-dev https://repo.stackable.tech/repository/helm-dev/
+  # end::helm-add-repo[]
+  echo "Installing Operators with Helm"
+  # tag::helm-install-operators[]
+  helm install --wait commons-operator stackable-dev/commons-operator --version {{ versions.commons }}
+  helm install --wait secret-operator stackable-dev/secret-operator --version {{ versions.secret }}
+  helm install --wait spark-k8s-operator stackable-dev/spark-k8s-operator --version {{ versions.spark }}
+  # end::helm-install-operators[]
+  ;;
+"stackablectl")
+  echo "Installing Operators with stackablectl"
+  # tag::stackablectl-install-operators[]
+  stackablectl operator install \
+    commons={{ versions.commons }} \
+    secret={{ versions.secret }} \
+    spark-k8s={{ versions.spark }}
+  # end::stackablectl-install-operators[]
+  ;;
+*)
+  echo "Need to give 'helm' or 'stackablectl' as an argument for which installation method to use!"
+  exit 1
+  ;;
+esac
+
+echo "Creating a Spark Application..."
+# tag::install-sparkapp[]
+kubectl apply -f pyspark-pi.yaml
+# end::install-sparkapp[]
+
+echo "Waiting for job to complete ..."
+# tag::wait-for-job[]
+kubectl wait pods -l 'job-name=pyspark-pi' \
+  --for jsonpath='{.status.phase}'=Succeeded \
+  --timeout 300s
+# end::wait-for-job[]
+
+result=$(kubectl logs -l 'spark-role=driver' --tail=-1 | grep "Pi is roughly")
+
+if [ "$result" == "" ]; then
+  echo "Log result was not found!"
+  exit 1
+else
+  echo "Job result:" $result
+fi
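This new file (its path is not shown in the diff view above) is the complete test script. A hedged usage sketch, assuming it is saved as `getting_started.sh` next to `pyspark-pi.yaml` and that the `{{ versions.* }}` placeholders have already been rendered to concrete versions:

# Usage sketch (assumptions: saved as getting_started.sh, placeholders rendered)
chmod +x getting_started.sh
./getting_started.sh stackablectl   # or: ./getting_started.sh helm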

docs/modules/getting_started/pages/first_steps.adoc

+11-5
@@ -2,13 +2,13 @@
 
 Once you have followed the steps in the xref:installation.adoc[] section to install the Operator and its dependencies, you will now create a Spark job. Afterwards you can <<_verify_that_it_works, verify that it works>> by looking at the logs from the driver pod.
 
-=== Airflow
+=== Starting a Spark job
 
-An Airflow cluster is made of up three components:
+A SparkApplication is made up of three components:
 
-- `webserver`: this provides the main UI for user-interaction
-- `workers`: the nodes over which the job workload will be distributed by the scheduler
-- `scheduler`: responsible for triggering jobs and persisting their metadata to the backend database
+- Job: builds a spark-submit command from the resource and passes it to internal Spark code, together with templates for building the driver and executor pods
+- Driver: starts the designated number of executors and removes them when the job is completed
+- Executor(s): responsible for executing the job itself
 
 Create a file named `pyspark-pi.yaml` with the following contents:
 
@@ -50,6 +50,12 @@ image::spark_running.png[Spark job]
 - `pyspark-pi-xxxxxxx-driver`: the driver pod that drives the execution
 - `pythonpi-xxxxxxxxx-exec-x`: the set of executors started by the driver (in our example `spec.executor.instances` was set to 3 which is why we have 3 executors)
 
+Job progress can be followed by issuing this command:
+
+----
+include::example$code/getting_started.sh[tag=wait-for-job]
+----
+
 When the job completes the driver cleans up the executor. The initial job is persisted for several minutes before being removed. The completed state will look like this:
 
 image::spark_complete.png[Completed job]
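Besides the blocking `kubectl wait` included via the `wait-for-job` tag, readers may want to watch the job's pods while it runs. A small sketch, not part of this commit, using standard kubectl commands and the labels from the guide:

# Watch the job's pods change phase while the example runs (Ctrl+C to stop)
kubectl get pods -l 'job-name=pyspark-pi' --watch

# After completion, inspect the driver log for the expected output line
kubectl logs -l 'spark-role=driver' --tail=-1 | grep "Pi is roughly"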

docs/templating_vars.yaml

+1-1
@@ -5,4 +5,4 @@ helm:
 versions:
   commons: 0.3.0-nightly
   secret: 0.6.0-nightly
-  airflow: 0.5.0-nightly
+  spark: 0.5.0-nightly
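The `{{ versions.spark }}` placeholder used by the install commands is defined here. The docs build's actual templating mechanism is not shown in this diff; purely as an illustration, a simple substitution could look like this (hypothetical file names):

# Hypothetical illustration only: render the spark version placeholder with sed
# before running the script. The real docs tooling may work differently.
SPARK_VERSION=0.5.0-nightly
sed "s/{{ versions.spark }}/${SPARK_VERSION}/g" getting_started.sh > getting_started.rendered.sh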
