[Merged by Bors] - Getting start docs #114

Closed
wants to merge 25 commits into from

25 commits
4f56431
initial commit: wip
adwk67 Aug 10, 2022
9e24796
removed non-relevant code
adwk67 Aug 10, 2022
cf4a305
added job images etc.
adwk67 Aug 11, 2022
19179ea
added job wait and log parsing
adwk67 Aug 11, 2022
7332a08
gitaction change
adwk67 Aug 11, 2022
e4dadfd
Update docs/modules/getting_started/pages/first_steps.adoc
adwk67 Aug 15, 2022
b5ad835
Update docs/modules/getting_started/pages/first_steps.adoc
adwk67 Aug 15, 2022
ec8d658
Update docs/modules/getting_started/pages/first_steps.adoc
adwk67 Aug 15, 2022
b65148c
Update docs/modules/getting_started/pages/first_steps.adoc
adwk67 Aug 15, 2022
8d319d1
Update docs/modules/getting_started/pages/first_steps.adoc
adwk67 Aug 15, 2022
6f704ec
Update docs/modules/getting_started/pages/installation.adoc
adwk67 Aug 15, 2022
73a392f
Update docs/modules/getting_started/pages/installation.adoc
adwk67 Aug 15, 2022
b467780
Update docs/modules/getting_started/pages/installation.adoc
adwk67 Aug 15, 2022
a792138
Update docs/modules/getting_started/pages/installation.adoc
adwk67 Aug 15, 2022
07287c8
Update docs/modules/getting_started/pages/installation.adoc
adwk67 Aug 15, 2022
482eb41
Update docs/modules/getting_started/pages/installation.adoc
adwk67 Aug 15, 2022
25d1c18
Update docs/modules/getting_started/pages/installation.adoc
adwk67 Aug 15, 2022
c2c4ffd
fixed copy-and-paste errors
adwk67 Aug 15, 2022
7595611
Delete tatus
razvan Aug 15, 2022
2dd79d4
clippy fix
adwk67 Aug 15, 2022
e8ac38f
Merge branch 'getting-start-docs' of github.com:stackabletech/spark-k…
adwk67 Aug 15, 2022
c903fe9
Fix shellcheck warning
razvan Aug 15, 2022
a99dde1
dependency intro-note
adwk67 Aug 15, 2022
d4869b4
updated changelog
adwk67 Aug 15, 2022
ef99cd9
Merge branch 'getting-start-docs' of github.com:stackabletech/spark-k…
adwk67 Aug 15, 2022
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -4,11 +4,19 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

### Added

- Add Getting Started documentation ([#114]).

[#114]: https://github.com/stackabletech/spark-k8s-operator/pull/114

### Fixed

- Add missing role to read S3Connection and S3Bucket objects ([#112]).
- Update annotation due to update to rust version ([#114]).

[#112]: https://github.com/stackabletech/spark-k8s-operator/pull/112
[#114]: https://github.com/stackabletech/spark-k8s-operator/pull/114

## [0.4.0] - 2022-08-03

1 change: 1 addition & 0 deletions docs/antora.yml
@@ -3,5 +3,6 @@ name: spark-k8s
version: "nightly"
title: Stackable Operator for Apache Spark on Kubernetes
nav:
- modules/getting_started/nav.adoc
- modules/ROOT/nav.adoc
prerelease: true
2 changes: 1 addition & 1 deletion docs/modules/ROOT/nav.adoc
@@ -1,4 +1,4 @@
* xref:installation.adoc[]
* xref:configuration.adoc[]
* xref:usage.adoc[]
* xref:job_dependencies.adoc[]
* xref:rbac.adoc[]
51 changes: 0 additions & 51 deletions docs/modules/ROOT/pages/installation.adoc

This file was deleted.

27 changes: 0 additions & 27 deletions docs/modules/ROOT/pages/usage.adoc
@@ -1,32 +1,5 @@
= Usage

== Create an Apache Spark job

If you followed the installation instructions, you should now have a Stackable Operator for Apache Spark up and running, and you are ready to create your first Apache Spark Kubernetes cluster.

The example below creates a job running on Apache Spark 3.3.0, using the Spark-on-Kubernetes paradigm described in the Spark documentation. The application file is itself part of the Spark distribution and `local` refers to the path on the driver/executors; there are no external dependencies.

cat <<EOF | kubectl apply -f -
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
  name: spark-clustermode-001
spec:
  version: 1.0
  mode: cluster
  mainClass: org.apache.spark.examples.SparkPi
  mainApplicationFile: local:///stackable/spark/examples/jars/spark-examples_2.12-3.3.0.jar
  image: 3.3.0-stackable0.1.0
  driver:
    cores: 1
    coreLimit: "1200m"
    memory: "512m"
  executor:
    cores: 1
    instances: 3
    memory: "512m"
EOF

== Examples

The following examples have the following `spec` fields in common:
64 changes: 64 additions & 0 deletions docs/modules/getting_started/examples/code/getting_started.sh
@@ -0,0 +1,64 @@
#!/usr/bin/env bash
set -euo pipefail

# This script contains all the code snippets from the guide, as well as some assert tests
# to test if the instructions in the guide work. The user *could* use it, but it is intended
# for testing only.
# The script will install the operators, submit a Spark application, wait for the job to
# complete and check the driver log for the expected result.

if [ $# -eq 0 ]
then
  echo "Installation method argument ('helm' or 'stackablectl') required."
  exit 1
fi

case "$1" in
"helm")
  echo "Adding 'stackable-dev' Helm Chart repository"
  # tag::helm-add-repo[]
  helm repo add stackable-dev https://repo.stackable.tech/repository/helm-dev/
  # end::helm-add-repo[]
  echo "Installing Operators with Helm"
  # tag::helm-install-operators[]
  helm install --wait commons-operator stackable-dev/commons-operator --version 0.3.0-nightly
  helm install --wait secret-operator stackable-dev/secret-operator --version 0.6.0-nightly
  helm install --wait spark-k8s-operator stackable-dev/spark-k8s-operator --version 0.5.0-nightly
  # end::helm-install-operators[]
  ;;
"stackablectl")
  echo "installing Operators with stackablectl"
  # tag::stackablectl-install-operators[]
  stackablectl operator install \
    commons=0.3.0-nightly \
    secret=0.6.0-nightly \
    spark-k8s=0.5.0-nightly
  # end::stackablectl-install-operators[]
  ;;
*)
  echo "Need to give 'helm' or 'stackablectl' as an argument for which installation method to use!"
  exit 1
  ;;
esac

echo "Creating a Spark Application..."
# tag::install-sparkapp[]
kubectl apply -f pyspark-pi.yaml
# end::install-sparkapp[]

echo "Waiting for job to complete ..."
# tag::wait-for-job[]
kubectl wait pods -l 'job-name=pyspark-pi' \
  --for jsonpath='{.status.phase}'=Succeeded \
  --timeout 300s
# end::wait-for-job[]

result=$(kubectl logs -l 'spark-role=driver' --tail=-1 | grep "Pi is roughly")

if [ "$result" == "" ]; then
  echo "Log result was not found!"
  exit 1
else
  echo "Job result: $result"
fi
64 changes: 64 additions & 0 deletions docs/modules/getting_started/examples/code/getting_started.sh.j2
@@ -0,0 +1,64 @@
#!/usr/bin/env bash
set -euo pipefail

# This script contains all the code snippets from the guide, as well as some assert tests
# to test if the instructions in the guide work. The user *could* use it, but it is intended
# for testing only.
# The script will install the operators, submit a Spark application, wait for the job to
# complete and check the driver log for the expected result.

if [ $# -eq 0 ]
then
  echo "Installation method argument ('helm' or 'stackablectl') required."
  exit 1
fi

case "$1" in
"helm")
  echo "Adding 'stackable-dev' Helm Chart repository"
  # tag::helm-add-repo[]
  helm repo add stackable-dev https://repo.stackable.tech/repository/helm-dev/
  # end::helm-add-repo[]
  echo "Installing Operators with Helm"
  # tag::helm-install-operators[]
  helm install --wait commons-operator stackable-dev/commons-operator --version {{ versions.commons }}
  helm install --wait secret-operator stackable-dev/secret-operator --version {{ versions.secret }}
  helm install --wait spark-k8s-operator stackable-dev/spark-k8s-operator --version {{ versions.spark }}
  # end::helm-install-operators[]
  ;;
"stackablectl")
  echo "installing Operators with stackablectl"
  # tag::stackablectl-install-operators[]
  stackablectl operator install \
    commons={{ versions.commons }} \
    secret={{ versions.secret }} \
    spark-k8s={{ versions.spark }}
  # end::stackablectl-install-operators[]
  ;;
*)
  echo "Need to give 'helm' or 'stackablectl' as an argument for which installation method to use!"
  exit 1
  ;;
esac

echo "Creating a Spark Application..."
# tag::install-sparkapp[]
kubectl apply -f pyspark-pi.yaml
# end::install-sparkapp[]

echo "Waiting for job to complete ..."
# tag::wait-for-job[]
kubectl wait pods -l 'job-name=pyspark-pi' \
  --for jsonpath='{.status.phase}'=Succeeded \
  --timeout 300s
# end::wait-for-job[]

result=$(kubectl logs -l 'spark-role=driver' --tail=-1 | grep "Pi is roughly")

if [ "$result" == "" ]; then
  echo "Log result was not found!"
  exit 1
else
  echo "Job result:" "$result"
fi
19 changes: 19 additions & 0 deletions docs/modules/getting_started/examples/code/pyspark-pi.yaml
@@ -0,0 +1,19 @@
---
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
  name: pyspark-pi
  namespace: default
spec:
  version: "1.0"
  sparkImage: docker.stackable.tech/stackable/pyspark-k8s:3.3.0-stackable0.1.0
  mode: cluster
  mainApplicationFile: local:///stackable/spark/examples/src/main/python/pi.py
  driver:
    cores: 1
    coreLimit: "1200m"
    memory: "512m"
  executor:
    cores: 1
    instances: 3
    memory: "512m"
3 files not shown.
3 changes: 3 additions & 0 deletions docs/modules/getting_started/nav.adoc
@@ -0,0 +1,3 @@
* xref:index.adoc[]
** xref:installation.adoc[]
** xref:first_steps.adoc[]
65 changes: 65 additions & 0 deletions docs/modules/getting_started/pages/first_steps.adoc
@@ -0,0 +1,65 @@
= First steps

Once you have followed the steps in the xref:installation.adoc[] section to install the operator and its dependencies, you can create a Spark job. Afterwards you can <<_verify_that_it_works, verify that it works>> by inspecting the logs of the driver pod.

== Starting a Spark job

A Spark application is made up of three components:

- Job: builds a `spark-submit` command from the resource and passes it to internal Spark code, together with templates for building the driver and executor pods (an illustrative command is sketched below)
- Driver: the driver starts the designated number of executors and removes them when the job is completed.
- Executor(s): responsible for executing the job itself
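
For illustration only, the command built by the job is roughly of the following shape. This is a sketch, not the exact command the operator generates; the placeholders refer to fields of the resource shown below:

[source,bash]
----
# Illustrative only: the operator assembles the real spark-submit call internally
spark-submit \
  --master k8s://https://<kubernetes-api-server> \
  --deploy-mode cluster \
  --name <metadata.name> \
  --conf spark.executor.instances=<spec.executor.instances> \
  --conf spark.kubernetes.container.image=<spec.sparkImage> \
  <spec.mainApplicationFile>
----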

Create a file named `pyspark-pi.yaml` with the following contents:

[source,yaml]
----
include::example$code/pyspark-pi.yaml[]
----

And apply it:

----
include::example$code/getting_started.sh[tag=install-sparkapp]
----

Where:

- `metadata.name` contains the name of the SparkApplication
- `spec.version`: the current version is "1.0"
- `spec.sparkImage`: the Docker image that will be used by job, driver and executor pods. This can be provided by the user.
- `spec.mode`: only `cluster` is currently supported
- `spec.mainApplicationFile`: the artifact (Java, Scala or Python) that forms the basis of the Spark job. This path is relative to the image, so in this case we are running an example Python script (that calculates the value of pi): it is bundled with the Spark code and therefore already present in the job image
- `spec.driver`: driver-specific settings.
- `spec.executor`: executor-specific settings.


NOTE: If using Stackable image versions, please note that the version you need to specify for `spec.sparkImage` is not only the version of Spark which you want to roll out, but has to be amended with a Stackable version as shown. This Stackable version is the version of the underlying container image which is used to execute the processes. For a list of available versions please check our
https://repo.stackable.tech/#browse/browse:docker:v2%2Fstackable%2Fspark-k8s%2Ftags[image registry].
It should generally be safe to simply use the latest image version that is available.

This will create the `SparkApplication` that in turn creates the Spark job.
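
To check that the resource has been created you can query it with `kubectl`; this sketch assumes the operator's CRD registers the plural resource name `sparkapplications`:

[source,bash]
----
# Inspect the SparkApplication resource that was just applied
kubectl get sparkapplications
----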

== Verify that it works

As mentioned above, the `SparkApplication` that has just been created will build a `spark-submit` command and pass it to the driver pod, which in turn will create executor pods that run for the duration of the job before being cleaned up. A running process will look like this:

image::spark_running.png[Spark job]

- `pyspark-pi-xxxx`: this is the initialising job that creates the spark-submit command (named after `metadata.name` with a unique suffix)
- `pyspark-pi-xxxxxxx-driver`: the driver pod that drives the execution
- `pythonpi-xxxxxxxxx-exec-x`: the set of executors started by the driver (in our example `spec.executor.instances` was set to 3, which is why we have 3 executors); a command to list these pods is shown below
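
The pods can also be listed on the command line; the driver label below is the one used by the getting started script, and a plain `kubectl get pods` works as well:

[source,bash]
----
# List all pods in the namespace: the job pod, the driver and the executors should appear
kubectl get pods

# Or select just the driver pod via its label
kubectl get pods -l 'spark-role=driver'
----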

Job progress can be followed by issuing this command:

----
include::example$code/getting_started.sh[tag=wait-for-job]
----

When the job completes, the driver cleans up the executor pods. The initial job is persisted for several minutes before being removed. The completed state will look like this:

image::spark_complete.png[Completed job]

The driver logs can be inspected for more information about the results of the job. In this case we expect to find the results of our (approximate!) pi calculation:

image::spark_log.png[Driver log]
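
The same check can be run from the command line; this is the one-liner used by the getting started script to look for the result in the driver log:

[source,bash]
----
# Fetch the complete driver log and look for the line containing the calculated value of pi
kubectl logs -l 'spark-role=driver' --tail=-1 | grep "Pi is roughly"
----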
18 changes: 18 additions & 0 deletions docs/modules/getting_started/pages/index.adoc
@@ -0,0 +1,18 @@
= Getting started

This guide will get you started with Spark using the Stackable Operator. It will walk you through the installation of the Operator and its dependencies, executing your first Spark job and reviewing the result.

== Prerequisites

You will need the following (a quick check that these tools are available is shown after the list):

* a Kubernetes cluster
* kubectl
* Helm
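
A quick way to confirm that the client tools are available might look like this; the versions in the output will differ between setups:

[source,bash]
----
# Print client versions to confirm kubectl and Helm are installed and on the PATH
kubectl version --client
helm version --short
----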

== What's next

The guide is divided into two steps:

* xref:installation.adoc[Installing the Operators].
* xref:first_steps.adoc[Starting a Spark job].