Adding conbench-poc #1

Closed · wants to merge 1 commit
16 changes: 16 additions & 0 deletions conbench-poc/.gitignore
@@ -0,0 +1,16 @@

.DS_Store
.vscode/
Deas-MacBook-Air.local/
__pycache__/
asv_files/**
my-files/
no_results/
failing/
asv_files_ALL/
algos2_results/
local_env.yml
server_env.yml
benchmarks.json
asv_processed_files
alert_processed_files
19 changes: 19 additions & 0 deletions conbench-poc/README.md
@@ -0,0 +1,19 @@
# Conbench PoC for pandas


The **purpose** of adding conbench to the current pandas benchmark system is:
1. Improve the UI.
2. Use conbench's statistical analysis to detect regressions and improvements.
3. Add an automatic alert system for regressions and improvements.

## Files description
**client.py:** Calls the `asvbench.py` adapter and posts results to a conbench web app. <br/>
**asvbench.py:** Converts asv benchmark results to the conbench format (see the sketch below). <br/>
**alert.py:** Runs the conbench alert pipeline, generates a report and sends alerts. <br/>
**benchmark_email.py:** Handles the alert email. <br/>
**utilities.py:** Sets up environment variables and reads files. <br/>
**setup_server.txt:** Steps to install this PoC. <br/>
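
For reference, here is a minimal sketch of how the adapter is driven directly (the file paths and the `"commit"` run reason are placeholders; `client.py` reads the real values from environment variables):

```python
from pathlib import Path
from asvbench import AsvBenchmarkAdapter

# Placeholder paths: point these at a real asv result file and at the
# directory that contains asv's benchmarks.json.
adapter = AsvBenchmarkAdapter(
    command=["echo", "results.json"],
    result_file=Path("asv_files/machine/abc1234-env.json"),
    benchmarks_file_path="asv_files/",
    result_fields_override={"run_reason": "commit"},
)
adapter.run()           # converts the asv JSON into BenchmarkResult objects
adapter.post_results()  # posts them to the conbench server configured via env vars
```

Note that `benchmarks_file_path` must include the trailing separator, because the adapter concatenates it with `"benchmarks.json"`.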

## PoC structure/setup
![Setup](setup_pic.png "Setup")
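
In outline: asv runs produce JSON result files; `client.py` polls for new files and posts them through the `asvbench.py` adapter to the conbench web app; `alert.py` then compares each newly posted commit against its parent run and, when a regression is detected, sends a report by email via `benchmark_email.py`.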
90 changes: 90 additions & 0 deletions conbench-poc/alert.py
@@ -0,0 +1,90 @@
import os
from utilities import Environment, alerts_done_file, check_new_files
import benchalerts.pipeline_steps as steps
from benchalerts.integrations.github import CheckStatus
import benchmark_email
import re
import json
#from benchalerts.pipeline_steps.slack import (
# SlackErrorHandler,
#)
from benchalerts import AlertPipeline, Alerter
from benchalerts.integrations.github import GitHubRepoClient
import asvbench
from benchalerts.conbench_dataclasses import FullComparisonInfo
import pandas as pd

env = Environment()

repo = env.GITHUB_REPOSITORY

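# Note: the commented-out GitHubCheckStep, SlackMessageAboutBadCheckStep and error handlers
# below are optional integrations that are left disabled in this PoC.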
def alert_instance(commit_hash):

# Create a pipeline that runs the conbench z-score comparison for this commit
pipeline = AlertPipeline(
steps=[
steps.GetConbenchZComparisonStep(
commit_hash=commit_hash,
#baseline_run_type=steps.BaselineRunCandidates.fork_point,
#baseline_run_type=steps.BaselineRunCandidates.latest_default,
baseline_run_type=steps.BaselineRunCandidates.parent,
z_score_threshold=5.5, #If not set, defaults to 5
),
#steps.GitHubCheckStep(
# commit_hash=commit_hash,
# comparison_step_name="GetConbenchZComparisonStep",
# github_client=GitHubRepoClient(repo=repo),
# #build_url=build_url,
#),
#steps.SlackMessageAboutBadCheckStep(
# channel_id="conbench-poc",
#),

],
#error_handlers=[
# steps.GitHubCheckErrorHandler(
# commit_hash=commit_hash, repo=repo, #build_url=build_url
# )
#],
)
return pipeline

# To see the whole report, look at:
# pipeline.run_pipeline()['GetConbenchZComparisonStep'].results_with_z_regressions
def report(pipeline):
full_comparison_info = pipeline.run_pipeline()['GetConbenchZComparisonStep']
alerter = Alerter()
if alerter.github_check_status(full_comparison_info) == CheckStatus.FAILURE:

message = """Subject: Benchmarks Alert \n\n """ \
+ alerter.github_check_summary(full_comparison_info, "")
#TODO add links to message
#cleaned_message = re.sub(r'0\.0\.0\.0', '127.0.0.1', message) #local
correctserver = re.sub(r'0\.0\.0\.0', '57.128.112.95', message) #new server
cleaned_message = re.sub(r'- Commit Run.+\)|#| All benchmark runs analyzed:', '', correctserver)
#send message or cleaned_message
benchmark_email.email(cleaned_message)

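# For every asv result file that client.py has already posted (listed in ASV_PROCESSED_FILES)
# but not yet alerted on (ALERT_PROCESSED_FILES), run the alert pipeline for its commit hash
# and then record the file as processed.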
def alert() -> None:

#while True:
with open(env.ASV_PROCESSED_FILES, "r+") as f:
processed_files = f.read().split('\n')

for new_file in (set(processed_files) - set(alerts_done_file(env))):
with open(new_file, "r") as f:
benchmarks_results = json.load(f)
pipeline = alert_instance(benchmarks_results['commit_hash'])
report(pipeline)

with open(env.ALERT_PROCESSED_FILES, "a") as f:
f.write(new_file)
f.write("\n")


if __name__ == "__main__":
#commit_hash = 'acf5d7d84187b5ba53e54b2a5d91a34725814bf9' #old server
#commit_hash = 'fce520d45a304ee2659bb4156acf484cee5aea07' #new server
#commit_hash = "c8a9c2fd3bcf23a21acfa6f4cffbc4c9360b9ea6" #local

alert()
149 changes: 149 additions & 0 deletions conbench-poc/asvbench.py
@@ -0,0 +1,149 @@
import json
from pathlib import Path
from typing import Any, Dict, List
import itertools
import numpy as np
import os
from datetime import datetime


from benchadapt.adapters._adapter import BenchmarkAdapter
from benchadapt.result import BenchmarkResult

class AsvBenchmarkAdapter(BenchmarkAdapter):

def __init__(
self,
command: List[str],
result_file: Path,
benchmarks_file_path: str,
result_fields_override: Dict[str, Any] = None,
result_fields_append: Dict[str, Any] = None,
) -> None:
"""
Parameters
----------
command : List[str]
A list of strings defining a shell command to run benchmarks
result_file : Path
Path to the asv result file to transform
benchmarks_file_path : str
Path to the directory containing asv's benchmarks.json (including the trailing separator)
result_fields_override : Dict[str, Any]
A dict of values to override on each instance of `BenchmarkResult`. Useful
for specifying metadata only available at runtime, e.g. build info. Applied
before ``result_fields_append``.
result_fields_append : Dict[str, Any]
A dict of default values to be appended to `BenchmarkResult` values after
instantiation. Useful for appending extra tags or other metadata in addition
to that gathered elsewhere. Only applicable for dict attributes. For each
element, will override any keys that already exist, i.e. it does not append
recursively.
"""
self.result_file = result_file
self.benchmarks_file_path=benchmarks_file_path
super().__init__(
command=command,
result_fields_override=result_fields_override,
result_fields_append=result_fields_append,
)

def _transform_results(self) -> List[BenchmarkResult]:
"""Transform asv results into a list of BenchmarkResults instances"""
parsed_benchmarks = []

with open(self.result_file, "r") as f:
benchmarks_results = json.load(f)

benchmarks_file = self.benchmarks_file_path + "benchmarks.json"
with open(benchmarks_file) as f:
benchmarks_info = json.load(f)

parsed_benchmarks = self._parse_results(benchmarks_results, benchmarks_info)

return parsed_benchmarks

def _parse_results(self, benchmarks_results, benchmarks_info):
# From the asv documentation, "result_columns" is a list of column names for the results dictionary:
# ["result", "params", "version", "started_at", "duration", "stats_ci_99_a", "stats_ci_99_b",
# "stats_q_25", "stats_q_75", "stats_number", "stats_repeat", "samples", "profile"]
# In this first version of the adapter we are using only the "result" column.
# TODO: use the "samples" column instead.
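# Illustrative shape of one "results" entry (benchmark name and values are made up):
#   benchmarks_results["results"]["algos.SomeBenchmark.time_method"] ==
#       [[0.0012, 0.0034],        # "result": one value per parameter combination
#        [["True", "False"]],     # "params": one list of values per parameter
#        ...]                     # remaining result_columns entries
# benchmarks_info (loaded from benchmarks.json) supplies "param_names" and "unit" per benchmark.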
try:
result_columns = benchmarks_results["result_columns"]
except KeyError:
raise Exception("Incorrect file format: 'result_columns' key missing")
parsed_benchmarks = []

for name in benchmarks_results["results"]:
#Bug with this benchmark: series_methods.ToFrame.time_to_frame
if name == "series_methods.ToFrame.time_to_frame":
continue
#print(name)
try:
result_dict = dict(zip(result_columns,
benchmarks_results["results"][name]))
for param_values, data in zip(
itertools.product(*result_dict["params"]),
result_dict['result']
):
if np.isnan(data):
#print('failing ', name)
continue
param_dic = dict(zip(benchmarks_info[name]["param_names"],
param_values))
tags = {}
tags["name"] = name
tags.update(param_dic)
#asv units are seconds or bytes, conbench uses "s" or "B"
units = {"seconds": "s",
"bytes": "B"}
params = benchmarks_results["params"]
parsed_benchmark = BenchmarkResult(
#batch_id=str(self.result_file), #CORRECT THIS
stats={
# asv returns one value, which is the average over the iterations,
# but it can be configured to return the value of each iteration.
# In that case the variable "data" will be a list, which would need to be handled here.
"data": [data],
"unit": units[benchmarks_info[name]['unit']],
# "iterations" below is for conbench: 1 because we only provide a single value.
# If asv were configured to report every iteration (in "data" above),
# "iterations" should match the number of values.
"iterations": 1,
},
tags=tags,
context={"benchmark_language": "Python",
"env_name": benchmarks_results["env_name"],
"python": benchmarks_results["python"],
"requirements": benchmarks_results["requirements"],
},
github={"repository": os.environ["REPOSITORY"],
"commit":benchmarks_results["commit_hash"],
},
info={"date": str(datetime.fromtimestamp(benchmarks_results["date"]/1e3)),
},
machine_info={
"name": params["machine"],
"os_name": params["os"],
"os_version":params["os"],
"architecture_name": params["arch"],
"kernel_name": "x",
"memory_bytes": 0,
"cpu_model_name": params["cpu"],
"cpu_core_count": params["num_cpu"],
"cpu_thread_count": 0,
"cpu_l1d_cache_bytes": 0,
"cpu_l1i_cache_bytes": 0,
"cpu_l2_cache_bytes": 0,
"cpu_l3_cache_bytes": 0,
"cpu_frequency_max_hz": 0,
"gpu_count": 0,
"gpu_product_names": [],
}
)
parsed_benchmarks.append(parsed_benchmark)
except Exception:
# Skip benchmarks whose results cannot be parsed into the expected format.
continue

return parsed_benchmarks

31 changes: 31 additions & 0 deletions conbench-poc/benchmark_email.py
@@ -0,0 +1,31 @@

import smtplib, ssl
import os
from dotenv import load_dotenv
import socket

if socket.gethostname().startswith('Deas'):
load_dotenv(dotenv_path="./local_env.yml")
else:
load_dotenv(dotenv_path="./server_env.yml")
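
# The env files above are expected to define GMAIL_PASSWORD; local_env.yml and server_env.yml
# are git-ignored and not included in this PR.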

def email(message):

port = 465 # For SSL
sender_email = "[email protected]"
receiver_email = ["[email protected]"]
gmail_password=os.getenv("GMAIL_PASSWORD")

# Create a secure SSL context
context = ssl.create_default_context()
with smtplib.SMTP_SSL("smtp.gmail.com", port, context=context) as server:
server.login(sender_email, gmail_password)
print(message)
server.sendmail(sender_email, receiver_email, message)

if __name__=="__main__":
message = """\
Subject: Hello

Message sent from conbenchalert."""
email(message)
36 changes: 36 additions & 0 deletions conbench-poc/client.py
@@ -0,0 +1,36 @@
from asvbench import AsvBenchmarkAdapter
from pathlib import Path
import os
import time
import alert
from utilities import Environment, check_new_files

env = Environment()

def adapter_instance(file_to_read) -> None:
adapter = AsvBenchmarkAdapter(
command=["echo", str(file_to_read)],
result_file=Path(file_to_read),
result_fields_override={
"run_reason": env.CONBENCH_RUN_REASON,
},
benchmarks_file_path=env.BENCHMARKS_FILE_PATH,
)
adapter.run()
adapter.post_results()


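# Poll for new asv result files (via utilities.check_new_files) and post every file not yet
# listed in ASV_PROCESSED_FILES, recording each one after it has been posted.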
def post_data() -> None:

while True:
all_files, processed_files = check_new_files(env)
for new_file in (set(all_files) - set(processed_files)):
adapter_instance(new_file)
with open(env.ASV_PROCESSED_FILES, "a") as f:
f.write(new_file)
f.write("\n")
time.sleep(30) #adjust this on server

if __name__=="__main__":
post_data()

Binary file added conbench-poc/setup_pic.png