= First steps

Once you have followed the steps in the xref:installation.adoc[] section to install the Operator and its dependencies, you can create a Spark job. Afterwards you can <<_verify_that_it_works, verify that it works>> by looking at the logs from the driver pod.


== Airflow

An Airflow cluster is made up of three components (a minimal manifest sketch follows this list):

- `webserver`: provides the main UI for user interaction
- `workers`: the nodes over which the scheduler distributes the job workload
- `scheduler`: responsible for triggering jobs and persisting their metadata to the backend database

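The sketch below shows how these three roles are typically declared in an `AirflowCluster` resource. It is only an orientation aid: the `apiVersion`, field names and values are assumptions based on a typical getting-started setup and may differ between operator versions, so treat the manifest used in your installation as authoritative. The `default` role-group name and the two worker replicas line up with the pod names shown later in this guide.

[source,yaml]
----
apiVersion: airflow.stackable.tech/v1alpha1      # assumed API version
kind: AirflowCluster
metadata:
  name: airflow
spec:
  executor: CeleryExecutor                       # illustrative executor choice
  loadExamples: true                             # load the example DAGs shown later
  credentialsSecret: simple-airflow-credentials  # hypothetical Secret holding the admin credentials
  webservers:
    roleGroups:
      default:
        replicas: 1
  workers:
    roleGroups:
      default:
        replicas: 2
  schedulers:
    roleGroups:
      default:
        replicas: 1
----
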
Create a file named `pyspark-pi.yaml` with the following contents:

[source,yaml]
----
include::example$code/pyspark-pi.yaml[]
----

And apply it:

----
include::example$code/getting_started.sh[tag=install-sparkapp]
----
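
If you are not using the accompanying script, the step above amounts to applying the manifest with `kubectl`:

[source,bash]
----
# Create (or update) the SparkApplication from the manifest written above
kubectl apply -f pyspark-pi.yaml
----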

Where (these fields are shown together in the sketch below):

- `metadata.name`: the name of the SparkApplication
- `spec.version`: the current version is "1.0"
- `spec.sparkImage`: the docker image used by the job, driver and executor pods. This can be provided by the user.
- `spec.mode`: only `cluster` is currently supported
- `spec.mainApplicationFile`: the artifact (Java, Scala or Python) that forms the basis of the Spark job.
- `spec.driver`: driver-specific settings.
- `spec.executor`: executor-specific settings.
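
Putting these fields together, a manifest of this kind looks roughly like the following sketch. The image tag, application path and driver/executor settings are illustrative assumptions and may differ from the included example; consult the operator's CRD reference for the exact fields.

[source,yaml]
----
apiVersion: spark.stackable.tech/v1alpha1   # assumed API version
kind: SparkApplication
metadata:
  name: pyspark-pi
spec:
  version: "1.0"
  sparkImage: docker.stackable.tech/stackable/pyspark-k8s:3.3.0-stackable0.3.0  # illustrative image tag
  mode: cluster
  mainApplicationFile: local:///stackable/spark/examples/src/main/python/pi.py  # illustrative path inside the image
  driver:      # illustrative driver settings
    cores: 1
    memory: "512m"
  executor:    # illustrative executor settings
    cores: 1
    instances: 3
    memory: "512m"
----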


NOTE: If you are using Stackable image versions, note that the version you specify in `spec.sparkImage` is not just the Spark version you want to roll out; it must also carry a Stackable version suffix, as shown. The Stackable version identifies the underlying container image that is used to execute the processes. For a list of available versions, please check our
https://repo.stackable.tech/#browse/browse:docker:v2%2Fstackable%2Fspark-k8s%2Ftags[image registry].
It should generally be safe to simply use the latest image version that is available.

This will create the SparkApplication that in turn creates the Spark job.
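
One way to follow the job's progress from the command line is sketched below. The `sparkapplications` resource name and the driver pod naming are assumptions: the driver pod name is derived from `metadata.name`, so look it up with `kubectl get pods` before following its logs.

[source,bash]
----
# Check that the SparkApplication resource was created
kubectl get sparkapplications

# Watch the job, driver and executor pods appear
kubectl get pods

# Follow the driver logs (replace the placeholder with the actual driver pod name,
# e.g. something like pyspark-pi-...-driver)
kubectl logs -f <driver-pod-name>
----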

== Initialization of the Airflow database

When an Airflow cluster is created, a database-initialization job is started first to ensure that the database schema is present and that it is populated with an admin user. A Kubernetes Job is created, which starts a pod to initialize the database. This can take a while.

You can use kubectl to wait on the resource. Note that the cluster itself will not be created until this step is complete:

[source,bash]
----
include::example$code/getting_started.sh[tag=wait-airflowdb]
----
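
The included script waits on the database-initialization resource; an equivalent manual check, assuming the initialization Job is named `airflow` as in the output shown below, is to wait for that Job to complete:

[source,bash]
----
# Block until the database-initialization Job has finished (adjust the name and timeout as needed)
kubectl wait --for=condition=complete job/airflow --timeout=300s
----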

The job status can be inspected and verified like this:

[source,bash]
----
kubectl get jobs
----

which will show something like this:

----
NAME      COMPLETIONS   DURATION   AGE
airflow   1/1           85s        11m
----
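
If the job does not reach `1/1` completions, its pod logs are the first place to look (the job name is taken from the output above):

[source,bash]
----
# Show the logs of the pod created by the initialization Job
kubectl logs job/airflow
----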

Then, make sure that all the Pods in the StatefulSets are ready:

[source,bash]
----
kubectl get statefulset
----

The output should show all pods ready, including the external dependencies:

----
NAME                        READY   AGE
airflow-postgresql          1/1     16m
airflow-redis-master        1/1     16m
airflow-redis-replicas      1/1     16m
airflow-scheduler-default   1/1     11m
airflow-webserver-default   1/1     11m
airflow-worker-default      2/2     11m
----

The completed set of pods for the Airflow cluster will look something like this:

image::airflow_pods.png[Airflow pods]
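
The same overview is available on the command line:

[source,bash]
----
# List all pods belonging to the Airflow cluster and its dependencies
kubectl get pods
----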

When the Airflow cluster has been created and the database is initialized, Airflow can be opened in the
browser: the webserver UI port (which defaults to `8080`) can be forwarded to the local host:

----
include::example$code/getting_started.sh[tag=port-forwarding]
----
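
Done manually, the port-forwarding step typically looks like the following; the service name is an assumption, so check `kubectl get services` for the exact name in your cluster:

[source,bash]
----
# Forward the webserver UI port to localhost:8080 (service name may differ, e.g. airflow-webserver-default)
kubectl port-forward service/airflow-webserver 8080:8080
----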

== Verify that it works

The webserver UI can now be opened in the browser at `http://localhost:8080`. Enter the admin credentials from the Kubernetes secret:
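
If your setup follows the typical getting-started example, the credentials live in a Secret created alongside the cluster. The Secret name `simple-airflow-credentials` and the `adminUser.username`/`adminUser.password` keys below are assumptions; adjust them to whatever your cluster definition references:

[source,bash]
----
# Read the admin username and password from the (assumed) credentials Secret
kubectl get secret simple-airflow-credentials -o jsonpath='{.data.adminUser\.username}' | base64 -d
kubectl get secret simple-airflow-credentials -o jsonpath='{.data.adminUser\.password}' | base64 -d
----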

image::airflow_login.png[Airflow login screen]

Since the examples were loaded in the cluster definition, they will appear under the DAGs tab:

image::airflow_dags.png[Example Airflow DAGs]

Select one of these DAGs by clicking on its name in the left-hand column, e.g. `example_complex`. Click on the arrow at the top right of the screen, select "Trigger DAG", and the DAG nodes will be highlighted automatically as the job works through its phases.

image::airflow_running.png[Airflow DAG in action]

Great! You have set up an Airflow cluster, connected to it and run your first DAG!

== What's next

Look at the xref:ROOT:usage.adoc[Usage page] to find out more about configuring your Airflow cluster and loading your own DAG files.