Skip to content

Added Benchmarking #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: jprakash-db/PECO-1803
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions benchmarking/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
import pytest


@pytest.fixture(scope="session")
def benchmarking_host():
    """Hostname of the benchmarking warehouse, read from the environment."""
    return os.environ.get("BENCHMARKING_SERVER_HOSTNAME")


@pytest.fixture(scope="session")
def benchmarking_http_path():
    """HTTP path of the benchmarking warehouse, read from the environment."""
    return os.environ.get("BENCHMARKING_HTTP_PATH")


@pytest.fixture(scope="session")
def benchmarking_access_token():
    """Access token for the benchmarking warehouse, read from the environment."""
    return os.environ.get("BENCHMARKING_TOKEN")


@pytest.fixture(scope="session")
def benchfood_host():
    """Hostname of the benchfood (results) warehouse, read from the environment."""
    return os.environ.get("BENCHFOOD_SERVER_HOSTNAME")


@pytest.fixture(scope="session")
def benchfood_http_path():
    """HTTP path of the benchfood (results) warehouse, read from the environment."""
    return os.environ.get("BENCHFOOD_HTTP_PATH")


@pytest.fixture(scope="session")
def benchfood_access_token():
    """Access token for the benchfood (results) warehouse, read from the environment."""
    return os.environ.get("BENCHFOOD_TOKEN")


@pytest.fixture(scope="session", autouse=True)
def connection_details(benchmarking_host, benchmarking_http_path, benchmarking_access_token, benchfood_host, benchfood_http_path, benchfood_access_token):
    """Bundle every per-warehouse setting into a single session-wide dict.

    Keys mirror the individual fixture names so tests can look up values
    without depending on each fixture separately.
    """
    return dict(
        benchmarking_host=benchmarking_host,
        benchmarking_http_path=benchmarking_http_path,
        benchmarking_access_token=benchmarking_access_token,
        benchfood_host=benchfood_host,
        benchfood_http_path=benchfood_http_path,
        benchfood_access_token=benchfood_access_token,
    )
116 changes: 116 additions & 0 deletions benchmarking/test_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import random
import time
from databricks import sql
import logging
import pytest
from contextlib import contextmanager
from datetime import datetime
log = logging.getLogger(__name__)


class TestBenchmarkingSuite:
    """Benchmark SELECT latency against a warehouse and persist trimmed means.

    Each scenario runs ``ATTEMPTS`` queries, sorts the durations, drops the
    fastest and slowest run, and inserts the mean of the remainder into
    ``RESULTS_TABLE`` on the benchfood warehouse.

    NOTE(review): relies on the ``connection_details`` session fixture from
    conftest.py — the BENCHMARKING_*/BENCHFOOD_* env vars must be set.
    """

    # TAG = "PRE-SPLIT"
    TAG = "POST-SPLIT"  # label written alongside every result row
    CATALOG_NAME = "main"
    SCHEMA_NAME = "tpcds_sf100_delta"
    TABLE_NAME = "catalog_sales"
    RESULTS_TABLE = "main.pysql_benchmarking_schema.benchmarking_results"
    ATTEMPTS = 10  # repetitions per scenario; min and max runs are discarded
    ROWS = 1000000
    LARGE_QUERY_LIMIT = 1000000  # rows fetched per "large" query
    SMALL_QUERY_LIMIT = 10000  # rows fetched per "small" query

    @pytest.fixture(autouse=True)
    def get_details(self, connection_details):
        """Split the session connection dict into per-warehouse connect kwargs."""
        self.arguments = connection_details.copy()

        # Warehouse the benchmark queries run against.
        self.benchmarking_connection_params = {
            "server_hostname": self.arguments["benchmarking_host"],
            "http_path": self.arguments["benchmarking_http_path"],
            "access_token": self.arguments["benchmarking_access_token"],
        }

        # Warehouse that stores the benchmark results.
        self.benchfood_connection_params = {
            "server_hostname": self.arguments["benchfood_host"],
            "http_path": self.arguments["benchfood_http_path"],
            "access_token": self.arguments["benchfood_access_token"],
        }

    @contextmanager
    def connection(self, connection_params):
        """Yield an open connection, guaranteeing close() on exit."""
        log.info("Connecting with args: {}".format(connection_params))
        conn = sql.connect(**connection_params)

        try:
            yield conn
        finally:
            conn.close()

    @contextmanager
    def cursor(self, connection_params):
        """Yield a cursor on a fresh connection; cursor then connection are closed."""
        with self.connection(connection_params) as conn:
            cursor = conn.cursor()
            try:
                yield cursor
            finally:
                cursor.close()

    def removed_outlier_mean(self, data):
        """Return the mean of *data* with its first and last elements dropped.

        Callers pass a sorted list, so this discards the single smallest and
        single largest measurement (a trimmed mean).

        Raises:
            ValueError: if fewer than three samples are given — the previous
                implementation divided by zero (or a negative count) here.
        """
        if len(data) < 3:
            raise ValueError("need at least 3 samples to drop min/max outliers")
        trimmed = data[1:-1]
        return sum(trimmed) / len(trimmed)

    def insert_benchmarking_results_data(self, function_name, query_time):
        """Persist one (tag, scenario, duration, timestamp) row to RESULTS_TABLE."""
        log.info(f"Inserting results {self.TAG} - {function_name}")
        with self.cursor(self.benchfood_connection_params) as cursor:
            # Named parameter markers (the same `:param` style used elsewhere
            # in this repo) let the driver escape the values; only the table
            # name, a class constant, is interpolated.
            cursor.execute(
                f"INSERT INTO {self.RESULTS_TABLE} "
                "(tag, function_name, compute_duration, date_time) "
                "VALUES (:tag, :function_name, :compute_duration, :date_time)",
                {
                    "tag": self.TAG,
                    "function_name": function_name,
                    "compute_duration": query_time,
                    "date_time": str(datetime.now()),
                },
            )

    def get_query_time(self, query, expected_num_rows):
        """Run *query*, fetch all rows, and return elapsed wall time in seconds.

        Asserts exactly *expected_num_rows* rows came back so a truncated
        result set cannot masquerade as a fast run.
        """
        # perf_counter is monotonic, unlike time.time(), so the measurement
        # cannot be skewed by system clock adjustments mid-run.
        start_time = time.perf_counter()
        with self.cursor(self.benchmarking_connection_params) as cursor:
            cursor.execute(query)
            result = cursor.fetchall()
            log.info("Fetched {} rows".format(len(result)))

        assert len(result) == expected_num_rows

        return time.perf_counter() - start_time

    def _run_query_benchmark(self, function_name, limit):
        """Shared driver: time ATTEMPTS queries of *limit* rows, record the mean.

        Each attempt reads a random offset inside its own window so repeated
        runs do not hit an identical (possibly cached) result set.
        """
        compute_duration = []

        for i in range(self.ATTEMPTS):
            log.info("Attempt: {}".format(i))
            offset = i * limit + random.randint(1, limit)

            query = "select * from {}.{}.{} LIMIT {} OFFSET {}".format(
                self.CATALOG_NAME, self.SCHEMA_NAME, self.TABLE_NAME, limit, offset
            )
            compute_duration.append(self.get_query_time(query, limit))

        compute_duration.sort()
        self.insert_benchmarking_results_data(
            function_name, self.removed_outlier_mean(compute_duration)
        )

    def test_large_queries_performance(self):
        """Benchmark LARGE_QUERY_LIMIT-row queries."""
        self._run_query_benchmark("large_query", self.LARGE_QUERY_LIMIT)

    def test_small_queries_performance(self):
        """Benchmark SMALL_QUERY_LIMIT-row queries."""
        self._run_query_benchmark("small_query", self.SMALL_QUERY_LIMIT)


10 changes: 5 additions & 5 deletions check.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,20 @@
# Load environment variables from .env file
# load_dotenv()

host = "e2-dogfood.staging.cloud.databricks.com"
http_path = "/sql/1.0/warehouses/58aa1b363649e722"
host = os.getenv("MY_SERVER_HOSTNAME")
http_path = os.getenv("MY_HTTP_PATH")
access_token = os.getenv("MY_TOKEN")

access_token = ""
connection = sql.connect(
server_hostname=host,
http_path=http_path,
access_token=access_token)


cursor = connection.cursor()
cursor.execute('SELECT :param `p`, * FROM RANGE(10)', {"param": "foo"})
cursor.execute("select * from `auto_maintenance_bugbash`.`tpcds_sf1000_naga_testv32`.`store_sales` LIMIT 1000")
# cursor.execute('SELECT 1')
result = cursor.fetchall()
result = cursor.fetchmany(10)
for row in result:
print(row)

Expand Down
Binary file not shown.
Binary file not shown.
56 changes: 20 additions & 36 deletions databricks_sql_connector/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,53 +10,37 @@ include = ["CHANGELOG.md"]

[tool.poetry.dependencies]
python = "^3.8.0"
#thrift = ">=0.16.0,<0.21.0"
#pandas = [
# { version = ">=1.2.5,<2.2.0", python = ">=3.8" }
#]
#pyarrow = ">=14.0.1,<17"

#lz4 = "^4.0.2"
#requests = "^2.18.1"
#oauthlib = "^3.1.0"
#numpy = [
# { version = "^1.16.6", python = ">=3.8,<3.11" },
# { version = "^1.23.4", python = ">=3.11" },
#]
# Remaining: add databricks_sql_connector_core
databricks_sqlalchemy = { version = ">=1.0.0", optional = true }
#openpyxl = "^3.0.10"
#alembic = { version = "^1.0.11", optional = true }
#urllib3 = ">=1.26"
#

[tool.poetry.extras]
databricks_sqlalchemy = ["databricks_sqlalchemy"]
#alembic = ["sqlalchemy", "alembic"]
#
#[tool.poetry.dev-dependencies]
#pytest = "^7.1.2"
#mypy = "^1.10.1"
#pylint = ">=2.12.0"
#black = "^22.3.0"
#pytest-dotenv = "^0.5.2"

#[tool.poetry.urls]
#"Homepage" = "https://github.com/databricks/databricks-sql-python"
#"Bug Tracker" = "https://github.com/databricks/databricks-sql-python/issues"
[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
mypy = "^1.10.1"
pylint = ">=2.12.0"
black = "^22.3.0"
pytest-dotenv = "^0.5.2"

[tool.poetry.urls]
"Homepage" = "https://github.com/databricks/databricks-sql-python"
"Bug Tracker" = "https://github.com/databricks/databricks-sql-python/issues"

[tool.poetry.plugins."sqlalchemy.dialects"]
"databricks" = "databricks_sqlalchemy:DatabricksDialect"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
#
#[tool.mypy]
#ignore_missing_imports = "true"
#exclude = ['ttypes\.py$', 'TCLIService\.py$']
#
#[tool.black]
#exclude = '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|\.svn|_build|buck-out|build|dist|thrift_api)/'
#

[tool.mypy]
ignore_missing_imports = "true"
exclude = ['ttypes\.py$', 'TCLIService\.py$']

[tool.black]
exclude = '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|\.svn|_build|buck-out|build|dist|thrift_api)/'

[tool.pytest.ini_options]
markers = {"reviewed" = "Test case has been reviewed by Databricks"}
minversion = "6.0"
Expand Down
2 changes: 1 addition & 1 deletion setup_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ def build_and_install_library(directory_name):
if __name__ == "__main__":
build_and_install_library("databricks_sql_connector_core")
build_and_install_library("databricks_sql_connector")
build_and_install_library("databricks_sqlalchemy")
# build_and_install_library("databricks_sqlalchemy")