Commit 5b0b4a9
Spark anomaly detection demo (#155)
## Description

A demo using spark-k8s to create anomaly detection scores from NYC taxi data, writing the results to a Trino table and displaying them in a Superset dashboard. Closes #142.
1 parent b43020d commit 5b0b4a9

19 files changed: +710 -0 lines changed

demos/demos-v1.yaml

Lines changed: 15 additions & 0 deletions
@@ -53,6 +53,21 @@ demos:
       - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/create-nifi-ingestion-job.yaml
       - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/create-druid-ingestion-job.yaml
       - plainYaml: https://raw.githubusercontent.com/stackabletech/stackablectl/main/demos/nifi-kafka-druid-water-level-data/setup-superset.yaml
+  spark-k8s-anomaly-detection-taxi-data:
+    description: Demo loading New York taxi data into an S3 bucket and carrying out an anomaly detection analysis on it
+    documentation: https://docs.stackable.tech/stackablectl/stable/demos/spark-k8s-anomaly-detection-taxi-data.html
+    stackableStack: spark-trino-superset-s3
+    labels:
+      - trino
+      - superset
+      - minio
+      - s3
+      - ny-taxi-data
+    manifests:
+      - plainYaml: demos/spark-k8s-anomaly-detection-taxi-data/serviceaccount.yaml
+      - plainYaml: demos/spark-k8s-anomaly-detection-taxi-data/load-test-data.yaml
+      - plainYaml: demos/spark-k8s-anomaly-detection-taxi-data/create-spark-anomaly-detection-job.yaml
+      - plainYaml: demos/spark-k8s-anomaly-detection-taxi-data/setup-superset.yaml
   trino-taxi-data:
     description: Demo loading 2.5 years of New York taxi data into S3 bucket, creating a Trino table and a Superset dashboard
     documentation: https://docs.stackable.tech/stackablectl/stable/demos/trino-taxi-data.html
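
The new entry mirrors the existing demos: it points at the spark-trino-superset-s3 stack and lists the plainYaml manifests that make up the demo. A minimal sketch of inspecting the registry entry programmatically, assuming PyYAML is installed and the script runs from the repository root:

```python
# Sketch: read the demo registry and print the manifests for the new demo.
# Assumes PyYAML is available and the working directory is the repository root.
import yaml

with open("demos/demos-v1.yaml") as f:
    demos = yaml.safe_load(f)["demos"]

entry = demos["spark-k8s-anomaly-detection-taxi-data"]
print(entry["stackableStack"])
for manifest in entry["manifests"]:
    print(manifest["plainYaml"])
```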

demos/spark-k8s-anomaly-detection-taxi-data/create-spark-anomaly-detection-job.yaml

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: create-spark-anomaly-detection-job
spec:
  template:
    spec:
      initContainers:
        - name: wait-for-testdata
          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
          command: ["bash", "-c", "echo 'Waiting for job load-ny-taxi-data to finish' && kubectl wait --for=condition=complete --timeout=30m job/load-ny-taxi-data"]
      containers:
        - name: create-spark-anomaly-detection-job
          image: docker.stackable.tech/stackable/tools:0.2.0-stackable0.3.0
          command: ["bash", "-c", "echo 'Submitting Spark job' && kubectl apply -f /tmp/manifest/spark-ad-job.yaml"]
          volumeMounts:
            - name: manifest
              mountPath: /tmp/manifest
      volumes:
        - name: manifest
          configMap:
            name: create-spark-ad-job-manifest
      restartPolicy: OnFailure
  backoffLimit: 50
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: create-spark-ad-job-manifest
data:
  spark-ad-job.yaml: |
    ---
    apiVersion: spark.stackable.tech/v1alpha1
    kind: SparkApplication
    metadata:
      name: spark-ad
    spec:
      version: "1.0"
      sparkImage: docker.stackable.tech/demos/pyspark-k8s-with-kafka-and-iceberg:3.3.0-stackable0.2.0
      mode: cluster
      mainApplicationFile: local:///spark-scripts/spark-ad.py
      deps:
        requirements:
          - scikit-learn==0.24.2
      volumes:
        - name: cm-spark
          configMap:
            name: cm-spark
      job:
        resources:
          cpu:
            min: "100m"
            max: "500m"
          memory:
            limit: "1Gi"
      driver:
        resources:
          cpu:
            min: "2"
            max: "3"
          memory:
            limit: "2Gi"
        volumeMounts:
          - name: cm-spark
            mountPath: /spark-scripts
      executor:
        resources:
          cpu:
            min: "2"
            max: "3"
          memory:
            limit: "5Gi"
        volumeMounts:
          - name: cm-spark
            mountPath: /spark-scripts
      sparkConf:
        spark.kubernetes.submission.waitAppCompletion: "false"
        spark.kubernetes.driver.pod.name: "spark-ad-driver"
        spark.kubernetes.executor.podNamePrefix: "spark-ad"
        spark.executor.instances: "6"
        spark.hadoop.fs.s3a.aws.credentials.provider: "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
        spark.hadoop.fs.s3a.endpoint: http://minio-trino:9000
        spark.hadoop.fs.s3a.path.style.access: "true"
        spark.hadoop.fs.s3a.access.key: demo
        spark.hadoop.fs.s3a.secret.key: demodemo
        spark.sql.extensions: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
        spark.sql.catalog.prediction: org.apache.iceberg.spark.SparkCatalog
        spark.sql.catalog.prediction.type: hive
        spark.sql.catalog.prediction.uri: thrift://hive-iceberg:9083
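
The sparkConf block wires S3A access to the in-cluster MinIO and registers an Iceberg catalog named prediction backed by the Hive metastore at hive-iceberg:9083. For experimenting with the same settings outside the operator, they could equally be passed on the SparkSession builder; a minimal sketch, assuming the Iceberg and S3A runtime jars are already on the classpath (they are expected to come with the demo image):

```python
# Sketch: the same Iceberg/S3A settings expressed as SparkSession config,
# e.g. for running spark-ad.py interactively. Values are copied from the
# sparkConf above; the required jars are assumed to be provided by the image.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("ny-tlc-anomaly-detection")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio-trino:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", "demo")
    .config("spark.hadoop.fs.s3a.secret.key", "demodemo")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.prediction", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.prediction.type", "hive")
    .config("spark.sql.catalog.prediction.uri", "thrift://hive-iceberg:9083")
    .getOrCreate()
)
```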
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: cm-spark
data:
  spark-ad.py: |
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import dayofweek, to_date, to_timestamp, date_format, year, hour, minute, month, when, dayofmonth
    from pyspark.sql.functions import concat_ws, substring, concat, lpad, lit
    from pyspark.sql.functions import round, sum, count, avg
    from pyspark.sql.functions import lag
    from pyspark.sql.window import Window
    from pyspark.sql import functions, types
    from sklearn.ensemble import IsolationForest
    from sklearn.preprocessing import StandardScaler

    spark = SparkSession.builder.appName("ny-tlc-anomaly-detection").getOrCreate()
    spark.sql("CREATE SCHEMA IF NOT EXISTS prediction.ad LOCATION 's3a://prediction/anomaly-detection'")
    spark.sql("CREATE TABLE IF NOT EXISTS prediction.ad.iforest (pickup_ts TIMESTAMP, pickup_minute_group VARCHAR(4), pickup_hour INT, pickup_year INT, pickup_month INT, pickup_dayofmonth INT, pickup_dayofweek INT, norides INT, total_bill DOUBLE, avg_bill DOUBLE, norides_lag INT, pred INT) USING iceberg")

    input_df = spark.read.parquet("s3a://demo/ny-taxi-data/raw/")

    df = input_df.select(
        to_date(input_df.pickup_datetime).alias("day_date")
        , year(input_df.pickup_datetime).alias('year')
        , month(input_df.pickup_datetime).alias('month')
        , dayofmonth(input_df.pickup_datetime).alias("dayofmonth")
        , dayofweek(input_df.pickup_datetime).alias("dayofweek")
        , hour(input_df.pickup_datetime).alias("hour")
        , minute(input_df.pickup_datetime).alias("minute")
        , input_df.driver_pay
    )

    df = df.withColumn("minute_group", when(df.minute < 30, '00').otherwise('30'))
    df = df.withColumn("time_group", concat_ws(":", lpad(df.hour, 2, '0'), df.minute_group, lit('00')))
    df = df.withColumn("ts", concat_ws(" ", df.day_date, df.time_group))

    dfs = df.select(
        to_timestamp(df.ts, "yyyy-MM-dd HH:mm:ss").alias("date_group")
        , df.minute_group
        , df.year
        , df.hour
        , df.month
        , df.dayofmonth
        , df.dayofweek
        , df.driver_pay
    ).groupby("date_group", "minute_group", "hour", "year", "month", "dayofmonth", "dayofweek").agg(functions.count('driver_pay').alias('no_rides'), functions.round(functions.sum('driver_pay'), 2).alias('total_bill'), functions.round(functions.avg('driver_pay'), 2).alias('avg_bill')).orderBy("date_group")

    windowSpec = Window.partitionBy("hour").orderBy("date_group")

    dfs = dfs.withColumn("lag", lag("no_rides", 2).over(windowSpec))
    dfs = dfs.filter("lag IS NOT NULL")

    scaler = StandardScaler()
    classifier = IsolationForest(contamination=0.005, n_estimators=200, max_samples=0.7, random_state=42, n_jobs=-1)

    df_model = dfs.select(dfs.minute_group, dfs.hour, dfs.year, dfs.month, dfs.dayofmonth, dfs.dayofweek, dfs.no_rides, dfs.total_bill, dfs.avg_bill, dfs.lag)

    x_train = scaler.fit_transform(df_model.collect())
    clf = classifier.fit(x_train)

    SCL = spark.sparkContext.broadcast(scaler)
    CLF = spark.sparkContext.broadcast(clf)

    def predict_using_broadcasts(minute_group, hour, year, month, dayofmonth, dayofweek, no_rides, total_bill, avg_bill, lag):
        prediction = 0
        x_test = [[minute_group, hour, year, month, dayofmonth, dayofweek, no_rides, total_bill, avg_bill, lag]]
        try:
            x_test = SCL.value.transform(x_test)
            prediction = CLF.value.predict(x_test)[0]
        except ValueError:
            import traceback
            traceback.print_exc()
            print('Cannot predict:', x_test)
        return int(prediction)

    udf_predict_using_broadcasts = functions.udf(predict_using_broadcasts, types.IntegerType())

    df_pred = dfs.withColumn(
        'prediction',
        udf_predict_using_broadcasts('minute_group', 'hour', 'year', 'month', 'dayofmonth', 'dayofweek', 'no_rides', 'total_bill', 'avg_bill', 'lag')
    )

    # map to table columns
    df_out = df_pred.select(
        df_pred.date_group.alias("pickup_ts")
        , df_pred.minute_group.alias("pickup_minute_group")
        , df_pred.hour.alias("pickup_hour")
        , df_pred.year.alias("pickup_year")
        , df_pred.month.alias("pickup_month")
        , df_pred.dayofmonth.alias("pickup_dayofmonth")
        , df_pred.dayofweek.alias("pickup_dayofweek")
        , df_pred.no_rides.alias("norides")
        , df_pred.total_bill.alias("total_bill")
        , df_pred.avg_bill.alias("avg_bill")
        , df_pred.lag.alias("norides_lag")
        , df_pred.prediction.alias("pred")
    )

    # write via iceberg
    df_out.writeTo("prediction.ad.iforest").append()
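
The script aggregates rides into half-hour windows, trains the IsolationForest on the driver (the df_model.collect() call pulls the aggregated features into driver memory), broadcasts the fitted scaler and classifier, and scores every row through a UDF. scikit-learn's IsolationForest labels inliers as 1 and anomalies as -1, so anomalous windows end up with pred = -1 in prediction.ad.iforest. A minimal sketch of reading the scores back in the same Spark session (a hypothetical follow-up, not part of the committed script):

```python
# Sketch: inspect the scored table written above; assumes the same
# SparkSession with the `prediction` Iceberg catalog configured.
anomalies = (
    spark.table("prediction.ad.iforest")
    .filter("pred = -1")  # IsolationForest labels anomalies as -1
    .orderBy("pickup_ts")
)
anomalies.select("pickup_ts", "norides", "total_bill", "pred").show(20, truncate=False)
```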

demos/spark-k8s-anomaly-detection-taxi-data/load-test-data.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: load-ny-taxi-data
spec:
  template:
    spec:
      containers:
        - name: load-ny-taxi-data
          image: "bitnami/minio:2022-debian-10"
          command: ["bash", "-c", "cd /tmp && for month in 2020-09 2020-10 2020-11 2020-12; do curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/fhvhv_tripdata_$month.parquet && mc --insecure alias set minio http://minio-trino:9000/ demo demodemo && mc cp fhvhv_tripdata_$month.parquet minio/demo/ny-taxi-data/raw/ && mc mb --ignore-existing minio/prediction; done"]
      restartPolicy: OnFailure
  backoffLimit: 50
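
The loader job fetches four months (2020-09 to 2020-12) of FHVHV trip data, copies the parquet files to s3a://demo/ny-taxi-data/raw/ with mc, and creates the prediction bucket that the Spark job writes to. A minimal sketch of verifying the upload, assuming boto3 is installed and the MinIO endpoint is reachable (for example through a port-forward); the credentials are the demo values from the manifest:

```python
# Sketch: list the uploaded parquet files in the demo bucket.
# Assumes MinIO is reachable at this endpoint (e.g. via kubectl port-forward).
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://minio-trino:9000",
    aws_access_key_id="demo",
    aws_secret_access_key="demodemo",
)
resp = s3.list_objects_v2(Bucket="demo", Prefix="ny-taxi-data/raw/")
for obj in resp.get("Contents", []):
    print(obj["Key"], obj["Size"])
```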

demos/spark-k8s-anomaly-detection-taxi-data/serviceaccount.yaml

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: demo-clusterrolebinding
subjects:
  - apiGroup: rbac.authorization.k8s.io
    kind: Group
    name: system:serviceaccounts
roleRef:
  kind: ClusterRole
  name: demo-clusterrole
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: demo-clusterrole
rules:
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - spark.stackable.tech
    resources:
      - sparkapplications
    verbs:
      - get
      - list
      - watch
      - create
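
The ClusterRoleBinding attaches demo-clusterrole to the system:serviceaccounts group, so the wrapper job's default service account may get/list/watch pods and jobs (needed for kubectl wait) and create SparkApplications (needed for kubectl apply). A minimal sketch of checking that grant from inside a pod, assuming the official kubernetes Python client is installed:

```python
# Sketch: ask the API server whether the current service account may create
# SparkApplications, mirroring `kubectl auth can-i create sparkapplications`.
from kubernetes import client, config

config.load_incluster_config()  # use config.load_kube_config() outside the cluster
review = client.V1SelfSubjectAccessReview(
    spec=client.V1SelfSubjectAccessReviewSpec(
        resource_attributes=client.V1ResourceAttributes(
            group="spark.stackable.tech",
            resource="sparkapplications",
            verb="create",
        )
    )
)
result = client.AuthorizationV1Api().create_self_subject_access_review(review)
print("allowed:", result.status.allowed)
```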

demos/spark-k8s-anomaly-detection-taxi-data/setup-superset.yaml

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: setup-superset
spec:
  template:
    spec:
      containers:
        - name: setup-superset
          image: docker.stackable.tech/stackable/testing-tools:0.1.0-stackable0.1.0
          # TODO update
          command: ["bash", "-c", "curl -o superset-assets.zip https://raw.githubusercontent.com/stackabletech/stackablectl/142_spark_anomaly_detection/demos/spark-k8s-anomaly-detection-taxi-data/superset-assets.zip && python -u /tmp/script/script.py"]
          volumeMounts:
            - name: script
              mountPath: /tmp/script
      volumes:
        - name: script
          configMap:
            name: setup-superset-script
      restartPolicy: OnFailure
  backoffLimit: 50
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: setup-superset-script
data:
  script.py: |
    import logging
    import requests

    base_url = "http://superset-external:8088"  # For local testing/development replace this, afterwards change it back to http://superset-external:8088
    username = "admin"
    password = "admin"

    logging.basicConfig(level=logging.INFO)
    logging.info("Starting setup of Superset")
    logging.info("Getting access token from /api/v1/security/login")
    session = requests.session()
    access_token = session.post(f"{base_url}/api/v1/security/login", json={"username": username, "password": password, "provider": "db", "refresh": True}).json()['access_token']

    logging.info("Getting csrf token from /api/v1/security/csrf_token")
    csrf_token = session.get(f"{base_url}/api/v1/security/csrf_token", headers={"Authorization": f"Bearer {access_token}"}).json()["result"]

    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {access_token}",
        "X-CSRFToken": csrf_token,
    }

    #########################
    # Export
    #########################
    #logging.info("Exporting all assets")
    #result = session.get(f"{base_url}/api/v1/assets/export", headers=headers)
    #assert result.status_code == 200
    #with open("superset-assets.zip", "wb") as f:
    #    f.write(result.content)

    #########################
    # Import
    #########################
    logging.info("Importing all assets")
    files = {
        "bundle": ("superset-assets.zip", open("superset-assets.zip", "rb")),
    }
    data = {
        "passwords": '{"databases/Trino.yaml": "demo"}'
    }
    result = session.post(f"{base_url}/api/v1/assets/import", headers=headers, files=files, data=data)
    print(result)
    print(result.text)
    assert result.status_code == 200

    logging.info("Finished setup of Superset")
Binary file not shown.