Skip to content

Commit ded0d65

Browse files
chore: Add an integration test for profiling (#465)
* chore: Add test for profiling * Remove metadata from TestAsserts * Use the repository docker.stackable.tech/sandbox/docker-images-pr540-5212814 for Hadoop in the integration tests * Fix linter warnings * Disable test case listener-class external-unstable * Revert using images from the sandbox repository * Enable test case listener-class external-unstable * fix: openshift test * fix: set openshift to false by default * Rename test files and create assertions for Role, RoleBinding, and ServiceAccount --------- Co-authored-by: Razvan-Daniel Mihai <[email protected]>
1 parent 730cc83 commit ded0d65

14 files changed

+319
-0
lines changed
Original file line numberDiff line numberDiff line change
# kuttl assertion: the RBAC objects created by the install step must exist.
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: test-role
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: test-rb
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-sa
Original file line numberDiff line numberDiff line change
# RBAC for the test runner pod. On OpenShift the Role additionally grants
# "use" on the privileged SecurityContextConstraints (Jinja conditional).
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: test-role
rules:
{% if test_scenario['values']['openshift'] == "true" %}
  - apiGroups: ["security.openshift.io"]
    resources: ["securitycontextconstraints"]
    resourceNames: ["privileged"]
    verbs: ["use"]
{% endif %}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: test-sa
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: test-rb
subjects:
  - kind: ServiceAccount
    name: test-sa
roleRef:
  kind: Role
  name: test-role
  apiGroup: rbac.authorization.k8s.io
Original file line numberDiff line numberDiff line change
# kuttl assertion: the Vector aggregator discovery ConfigMap must exist,
# but only when the VECTOR_AGGREGATOR environment variable is set.
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
{% if lookup('env', 'VECTOR_AGGREGATOR') %}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: vector-aggregator-discovery
{% endif %}
Original file line numberDiff line numberDiff line change
# Discovery ConfigMap pointing log agents at the Vector aggregator;
# only rendered when the VECTOR_AGGREGATOR environment variable is set.
{% if lookup('env', 'VECTOR_AGGREGATOR') %}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: vector-aggregator-discovery
data:
  ADDRESS: {{ lookup('env', 'VECTOR_AGGREGATOR') }}
{% endif %}
Original file line numberDiff line numberDiff line change
# kuttl assertion: the ZooKeeper server StatefulSet must be fully ready
# within 10 minutes.
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 600
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: test-zk-server-default
status:
  readyReplicas: 1
  replicas: 1
Original file line numberDiff line numberDiff line change
# Single-node ZooKeeper cluster plus the ZNode used by HDFS for HA
# coordination. Vector logging is wired up only when VECTOR_AGGREGATOR is set.
---
apiVersion: zookeeper.stackable.tech/v1alpha1
kind: ZookeeperCluster
metadata:
  name: test-zk
spec:
  image:
    productVersion: "{{ test_scenario['values']['zookeeper-latest'] }}"
    pullPolicy: IfNotPresent
{% if lookup('env', 'VECTOR_AGGREGATOR') %}
  clusterConfig:
    vectorAggregatorConfigMapName: vector-aggregator-discovery
{% endif %}
  servers:
    config:
      logging:
        enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
    roleGroups:
      default:
        replicas: 1
---
apiVersion: zookeeper.stackable.tech/v1alpha1
kind: ZookeeperZnode
metadata:
  name: test-znode
spec:
  clusterRef:
    name: test-zk
Original file line numberDiff line numberDiff line change
# kuttl assertion: all three HDFS role StatefulSets (2 namenodes for HA,
# 1 journalnode, 1 datanode) must be fully ready within 10 minutes.
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 600
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: test-hdfs-namenode-default
status:
  readyReplicas: 2
  replicas: 2
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: test-hdfs-journalnode-default
status:
  readyReplicas: 1
  replicas: 1
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: test-hdfs-datanode-default
status:
  readyReplicas: 1
  replicas: 1
Original file line numberDiff line numberDiff line change
# HDFS cluster under test: 2 namenodes (HA), 1 datanode, 1 journalnode,
# coordinated via the test-znode ZNode. Vector logging only when
# VECTOR_AGGREGATOR is set.
---
apiVersion: hdfs.stackable.tech/v1alpha1
kind: HdfsCluster
metadata:
  name: test-hdfs
spec:
  image:
    productVersion: "{{ test_scenario['values']['hadoop'] }}"
    pullPolicy: IfNotPresent
  clusterConfig:
    dfsReplication: 1
    zookeeperConfigMapName: test-znode
{% if lookup('env', 'VECTOR_AGGREGATOR') %}
    vectorAggregatorConfigMapName: vector-aggregator-discovery
{% endif %}
  nameNodes:
    config:
      logging:
        enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
    roleGroups:
      default:
        replicas: 2
  dataNodes:
    config:
      logging:
        enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
    roleGroups:
      default:
        replicas: 1
  journalNodes:
    config:
      logging:
        enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
    roleGroups:
      default:
        replicas: 1
Original file line numberDiff line numberDiff line change
# kuttl assertion: the test-runner StatefulSet must be ready within 5 minutes.
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 300
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: test-runner
status:
  readyReplicas: 1
  replicas: 1
Original file line numberDiff line numberDiff line change
# kuttl step: deploy the long-lived test-runner pod (Stackable testing-tools
# image) in which the profiling script is later copied and executed.
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
metadata:
  name: install-test-container
timeout: 300
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: test-runner
  labels:
    app: test-runner
spec:
  replicas: 1
  selector:
    matchLabels:
      app: test-runner
  template:
    metadata:
      labels:
        app: test-runner
    spec:
      serviceAccountName: test-sa
      securityContext:
        fsGroup: 1000
      containers:
        - name: python
          image: docker.stackable.tech/stackable/testing-tools:0.2.0-stackable0.0.0-dev
          # stdin/tty keep the container's shell alive so the pod stays Running.
          stdin: true
          tty: true
          resources:
            requests:
              memory: "128Mi"
              cpu: "512m"
            limits:
              memory: "128Mi"
              cpu: "1"
Original file line numberDiff line numberDiff line change
# kuttl assertion: executing the profiler script inside the test-runner pod
# must succeed (non-zero exit fails the test) within 5 minutes.
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 300
commands:
  - script: >-
      kubectl exec -n $NAMESPACE test-runner-0 --
      python /stackable/run-profiler.py
Original file line numberDiff line numberDiff line change
# kuttl step: copy the profiling script into the test-runner pod;
# the accompanying assert file then executes it.
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
metadata:
  name: run-profiler
commands:
  - script: |-
      kubectl cp run-profiler.py $NAMESPACE/test-runner-0:/stackable
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import re
2+
import requests
3+
import time
# async-profiler event type requested from the /prof endpoint.
EVENT_TYPE = "itimer"
# Duration of each profiling run; kept short so the integration test is fast.
PROFILING_DURATION_IN_SEC = 1
def start_profiling_and_get_refresh_header(service_url):
    """Start a profiling run via the /prof endpoint.

    Returns the value of the HTTP ``Refresh`` header, which announces when
    and where the resulting flamegraph will be available.
    """
    url = (
        f"{service_url}/prof"
        f"?event={EVENT_TYPE}&duration={PROFILING_DURATION_IN_SEC}"
    )
    response = requests.get(url)

    assert response.ok, \
        f"""Profiling could not be started.
        URL: {response.request.url}
        Status Code: {response.status_code}"""

    return response.headers['Refresh']
def parse_refresh_header(refresh_header):
    """Split an HTTP Refresh header ("<seconds>;<path>") into its parts.

    Checks that the announced refresh time equals the requested profiling
    duration and that the path looks like an async-profiler flamegraph.

    Returns:
        Tuple of (refresh_time_in_sec, refresh_path).
    """
    refresh_time_in_sec, refresh_path = refresh_header.split(';', 1)
    refresh_time_in_sec = int(refresh_time_in_sec)

    assert refresh_time_in_sec == PROFILING_DURATION_IN_SEC, \
        f"""Profiling duration and refresh time should be equal.
        expected: {PROFILING_DURATION_IN_SEC}
        actual: {refresh_time_in_sec}"""

    # Build the pattern from the configured event type (it was hard-coded to
    # "itimer" before) and escape the dot so e.g. "...-123Xhtml" is rejected.
    expected_refresh_path_pattern = (
        r'/prof-output-hadoop/async-prof-pid-\d+-'
        + re.escape(EVENT_TYPE) + r'-\d+\.html')
    assert re.fullmatch(expected_refresh_path_pattern, refresh_path), \
        f"""The path to the flamegraph contains an unexpected pattern.
        expected pattern: {expected_refresh_path_pattern}
        actual path: {refresh_path}"""

    return refresh_time_in_sec, refresh_path
def wait_for_profiling_to_finish(refresh_time_in_sec):
    """Block until the profiling run should be done, plus a small margin."""
    safety_margin_in_sec = 2
    time.sleep(refresh_time_in_sec + safety_margin_in_sec)
def fetch_flamegraph(service_url, refresh_path):
    """Download the generated flamegraph page and fail if it is unreachable."""
    flamegraph_url = f"{service_url}{refresh_path}"
    flamegraph_page = requests.get(flamegraph_url)

    assert flamegraph_page.ok, \
        f"""The flamegraph could not be fetched.
        URL: {flamegraph_page.request.url}
        Status Code: {flamegraph_page.status_code}"""
def test_profiling(role, port):
    """Run the end-to-end profiling check against one HDFS role's first pod.

    Starts a profiling run, validates the Refresh header, waits for the run
    to finish, then fetches the resulting flamegraph.
    """
    host = f"test-hdfs-{role}-default-0.test-hdfs-{role}-default"
    service_url = f"http://{host}:{port}"

    print(f"Test profiling on {service_url}")

    refresh_header = start_profiling_and_get_refresh_header(service_url)
    refresh_time_in_sec, refresh_path = parse_refresh_header(refresh_header)
    wait_for_profiling_to_finish(refresh_time_in_sec)
    fetch_flamegraph(service_url, refresh_path)
# Exercise the profiling endpoint on each HDFS role via its web UI port.
test_profiling(role="namenode", port=9870)
test_profiling(role="datanode", port=9864)
test_profiling(role="journalnode", port=8480)

tests/test-definition.yaml

+9
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ dimensions:
44
values:
55
- 3.2.4
66
- 3.3.4
7+
- 3.3.6
78
- name: hadoop-latest
89
values:
910
- 3.3.4
@@ -39,6 +40,9 @@ dimensions:
3940
# Requires manual setup, see create-kerberos-secretclass.yaml
4041
# This will *not* respect the kerberos-realm test attribute, but instead use a hard-coded realm
4142
# - activeDirectory
43+
- name: openshift
44+
values:
45+
- "false"
4246
tests:
4347
- name: smoke
4448
dimensions:
@@ -66,6 +70,11 @@ tests:
6670
dimensions:
6771
- hadoop-latest
6872
- zookeeper-latest
73+
- name: profiling
74+
dimensions:
75+
- hadoop
76+
- zookeeper-latest
77+
- openshift
6978
# Broken due to https://github.com/kudobuilder/kuttl/issues/322, see 40-assert.yaml for more details
7079
# - name: external-access
7180
# dimensions:

0 commit comments

Comments
 (0)