From bd9aa2b53de5a3ae4e3ef7f3df9c28adf82813cb Mon Sep 17 00:00:00 2001 From: Jothi Prakash Date: Thu, 29 Aug 2024 16:02:13 +0530 Subject: [PATCH] Added Benchmarking --- benchmarking/conftest.py | 44 +++++++ benchmarking/test_benchmark.py | 116 ++++++++++++++++++ check.py | 10 +- ...ricks_sql_connector-1.0.0-py3-none-any.whl | Bin 2295 -> 2353 bytes .../databricks_sql_connector-1.0.0.tar.gz | Bin 1678 -> 1546 bytes databricks_sql_connector/pyproject.toml | 56 +++------ setup_script.py | 2 +- 7 files changed, 186 insertions(+), 42 deletions(-) create mode 100644 benchmarking/conftest.py create mode 100644 benchmarking/test_benchmark.py diff --git a/benchmarking/conftest.py b/benchmarking/conftest.py new file mode 100644 index 00000000..67504328 --- /dev/null +++ b/benchmarking/conftest.py @@ -0,0 +1,44 @@ +import os +import pytest + + +@pytest.fixture(scope="session") +def benchmarking_host(): + return os.getenv("BENCHMARKING_SERVER_HOSTNAME") + + +@pytest.fixture(scope="session") +def benchmarking_http_path(): + return os.getenv("BENCHMARKING_HTTP_PATH") + + +@pytest.fixture(scope="session") +def benchmarking_access_token(): + return os.getenv("BENCHMARKING_TOKEN") + + +@pytest.fixture(scope="session") +def benchfood_host(): + return os.getenv("BENCHFOOD_SERVER_HOSTNAME") + + +@pytest.fixture(scope="session") +def benchfood_http_path(): + return os.getenv("BENCHFOOD_HTTP_PATH") + + +@pytest.fixture(scope="session") +def benchfood_access_token(): + return os.getenv("BENCHFOOD_TOKEN") + + +@pytest.fixture(scope="session", autouse=True) +def connection_details(benchmarking_host, benchmarking_http_path, benchmarking_access_token, benchfood_host, benchfood_http_path, benchfood_access_token): + return { + "benchmarking_host": benchmarking_host, + "benchmarking_http_path": benchmarking_http_path, + "benchmarking_access_token": benchmarking_access_token, + "benchfood_host": benchfood_host, + "benchfood_http_path": benchfood_http_path, + "benchfood_access_token": benchfood_access_token, + } diff --git a/benchmarking/test_benchmark.py b/benchmarking/test_benchmark.py new file mode 100644 index 00000000..93550b40 --- /dev/null +++ b/benchmarking/test_benchmark.py @@ -0,0 +1,116 @@ +import random +import time +from databricks import sql +import logging +import pytest +from contextlib import contextmanager +from datetime import datetime +log = logging.getLogger(__name__) + + +class TestBenchmarkingSuite: + + # TAG = "PRE-SPLIT" + TAG = "POST-SPLIT" + CATALOG_NAME = "main" + SCHEMA_NAME = "tpcds_sf100_delta" + TABLE_NAME = "catalog_sales" + RESULTS_TABLE = "main.pysql_benchmarking_schema.benchmarking_results" + ATTEMPTS = 10 + ROWS = 1000000 + LARGE_QUERY_LIMIT = 1000000 + SMALL_QUERY_LIMIT = 10000 + + @pytest.fixture(autouse=True) + def get_details(self, connection_details): + self.arguments = connection_details.copy() + + self.benchmarking_connection_params = { + "server_hostname": self.arguments["benchmarking_host"], + "http_path": self.arguments["benchmarking_http_path"], + "access_token": self.arguments["benchmarking_access_token"] + } + + self.benchfood_connection_params = { + "server_hostname": self.arguments["benchfood_host"], + "http_path": self.arguments["benchfood_http_path"], + "access_token": self.arguments["benchfood_access_token"] + } + + @contextmanager + def connection(self, connection_params): + log.info("Connecting with args: {}".format(connection_params)) + conn = sql.connect(**connection_params) + + try: + yield conn + finally: + conn.close() + + @contextmanager + def cursor(self, 
connection_params):
+        with self.connection(connection_params) as conn:
+            cursor = conn.cursor()
+            try:
+                yield cursor
+            finally:
+                cursor.close()
+
+    def removed_outlier_mean(self, data):
+        total = 0
+        for i in range(1, len(data)-1):
+            total += data[i]
+
+        return total/(len(data)-2)
+
+    def insert_benchmarking_results_data(self, function_name, query_time):
+
+        log.info(f"Inserting results {self.TAG} - {function_name}")
+        with self.cursor(self.benchfood_connection_params) as cursor:
+            cursor.execute(
+                f"INSERT INTO {self.RESULTS_TABLE} (tag, function_name, compute_duration, date_time) VALUES ('{self.TAG}', '{function_name}', {query_time}, '{datetime.now()}')"
+            )
+
+    def get_query_time(self, query, expected_num_rows):
+        start_time = time.time()
+        with self.cursor(self.benchmarking_connection_params) as cursor:
+            cursor.execute(query)
+            result = cursor.fetchall()
+            log.info("Fetched {} rows".format(len(result)))
+
+        assert len(result) == expected_num_rows
+
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+
+        return elapsed_time
+
+    def test_large_queries_performance(self):
+        compute_duration = []
+        function_name = "large_query"
+
+        for i in range(0, self.ATTEMPTS):
+            log.info("Attempt: {}".format(i))
+            offset = i * self.LARGE_QUERY_LIMIT + random.randint(1, self.LARGE_QUERY_LIMIT)
+
+            query = "select * from {}.{}.{} LIMIT {} OFFSET {}".format(self.CATALOG_NAME, self.SCHEMA_NAME, self.TABLE_NAME, self.LARGE_QUERY_LIMIT, offset)
+            compute_duration.append(self.get_query_time(query, self.LARGE_QUERY_LIMIT))
+
+        compute_duration.sort()
+        self.insert_benchmarking_results_data(function_name, self.removed_outlier_mean(compute_duration))
+
+    def test_small_queries_performance(self):
+        compute_duration = []
+        function_name = "small_query"
+
+        for i in range(0, self.ATTEMPTS):
+            log.info("Attempt: {}".format(i))
+            offset = i * self.SMALL_QUERY_LIMIT + random.randint(1, self.SMALL_QUERY_LIMIT)
+
+            query = "select * from {}.{}.{} LIMIT {} OFFSET {}".format(self.CATALOG_NAME, self.SCHEMA_NAME, self.TABLE_NAME, self.SMALL_QUERY_LIMIT, offset)
+            compute_duration.append(self.get_query_time(query, self.SMALL_QUERY_LIMIT))
+
+        compute_duration.sort()
+        self.insert_benchmarking_results_data(function_name, self.removed_outlier_mean(compute_duration))
+
+
diff --git a/check.py b/check.py
index a9aa6c43..bfbbcf45 100644
--- a/check.py
+++ b/check.py
@@ -21,10 +21,10 @@
 
 # Load environment variables from .env file
 # load_dotenv()
 
-host = "e2-dogfood.staging.cloud.databricks.com"
-http_path = "/sql/1.0/warehouses/58aa1b363649e722"
+host = os.getenv("MY_SERVER_HOSTNAME")
+http_path = os.getenv("MY_HTTP_PATH")
+access_token = os.getenv("MY_TOKEN")
 
-access_token = ""
 connection = sql.connect(
     server_hostname=host,
     http_path=http_path,
@@ -32,9 +32,9 @@
 
 cursor = connection.cursor()
 
-cursor.execute('SELECT :param `p`, * FROM RANGE(10)', {"param": "foo"})
+cursor.execute("select * from `auto_maintenance_bugbash`.`tpcds_sf1000_naga_testv32`.`store_sales` LIMIT 1000")
 # cursor.execute('SELECT 1')
 
-result = cursor.fetchall()
+result = cursor.fetchmany(10)
 
 for row in result:
     print(row)
diff --git a/databricks_sql_connector/dist/databricks_sql_connector-1.0.0-py3-none-any.whl b/databricks_sql_connector/dist/databricks_sql_connector-1.0.0-py3-none-any.whl
index 4cd32c830d241b29116f83314aabfca60d776a10..dee9fe011903c5c858e5e92f3a76cec6761d3164 100644
Binary files a/databricks_sql_connector/dist/databricks_sql_connector-1.0.0-py3-none-any.whl and b/databricks_sql_connector/dist/databricks_sql_connector-1.0.0-py3-none-any.whl differ
diff --git a/databricks_sql_connector/dist/databricks_sql_connector-1.0.0.tar.gz b/databricks_sql_connector/dist/databricks_sql_connector-1.0.0.tar.gz
index 05718f9200bb20aa356c7fb04d5f78ee309956fe..f6855cee8aad48e3b8695135a9b82efe40bfc034 100644
Binary files a/databricks_sql_connector/dist/databricks_sql_connector-1.0.0.tar.gz and b/databricks_sql_connector/dist/databricks_sql_connector-1.0.0.tar.gz differ
diff --git a/databricks_sql_connector/pyproject.toml b/databricks_sql_connector/pyproject.toml
index d0c4aafb..32b72b54 100644
--- a/databricks_sql_connector/pyproject.toml
+++ b/databricks_sql_connector/pyproject.toml
@@ -10,38 +10,22 @@ include = ["CHANGELOG.md"]
 
 [tool.poetry.dependencies]
 python = "^3.8.0"
-#thrift = ">=0.16.0,<0.21.0"
-#pandas = [
-#    { version = ">=1.2.5,<2.2.0", python = ">=3.8" }
-#]
-#pyarrow = ">=14.0.1,<17"
-
-#lz4 = "^4.0.2"
-#requests = "^2.18.1"
-#oauthlib = "^3.1.0"
-#numpy = [
-#    { version = "^1.16.6", python = ">=3.8,<3.11" },
-#    { version = "^1.23.4", python = ">=3.11" },
-#]
+# Remaining to add: databricks_sql_connector_core
 databricks_sqlalchemy = { version = ">=1.0.0", optional = true }
-#openpyxl = "^3.0.10"
-#alembic = { version = "^1.0.11", optional = true }
-#urllib3 = ">=1.26"
-#
+
 [tool.poetry.extras]
 databricks_sqlalchemy = ["databricks_sqlalchemy"]
-#alembic = ["sqlalchemy", "alembic"]
-#
-#[tool.poetry.dev-dependencies]
-#pytest = "^7.1.2"
-#mypy = "^1.10.1"
-#pylint = ">=2.12.0"
-#black = "^22.3.0"
-#pytest-dotenv = "^0.5.2"
-#[tool.poetry.urls]
-#"Homepage" = "https://github.com/databricks/databricks-sql-python"
-#"Bug Tracker" = "https://github.com/databricks/databricks-sql-python/issues"
+[tool.poetry.dev-dependencies]
+pytest = "^7.1.2"
+mypy = "^1.10.1"
+pylint = ">=2.12.0"
+black = "^22.3.0"
+pytest-dotenv = "^0.5.2"
+
+[tool.poetry.urls]
+"Homepage" = "https://github.com/databricks/databricks-sql-python"
+"Bug Tracker" = "https://github.com/databricks/databricks-sql-python/issues"
 
 [tool.poetry.plugins."sqlalchemy.dialects"]
 "databricks" = "databricks_sqlalchemy:DatabricksDialect"
@@ -49,14 +33,14 @@ databricks_sqlalchemy = ["databricks_sqlalchemy"]
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
-#
-#[tool.mypy]
-#ignore_missing_imports = "true"
-#exclude = ['ttypes\.py$', 'TCLIService\.py$']
-#
-#[tool.black]
-#exclude = '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|\.svn|_build|buck-out|build|dist|thrift_api)/'
-#
+
+[tool.mypy]
+ignore_missing_imports = "true"
+exclude = ['ttypes\.py$', 'TCLIService\.py$']
+
+[tool.black]
+exclude = '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|\.svn|_build|buck-out|build|dist|thrift_api)/'
+
 [tool.pytest.ini_options]
 markers = {"reviewed" = "Test case has been reviewed by Databricks"}
 minversion = "6.0"
diff --git a/setup_script.py b/setup_script.py
index 27b021cb..6b12fe9d 100644
--- a/setup_script.py
+++ b/setup_script.py
@@ -28,4 +28,4 @@ def build_and_install_library(directory_name):
 
 if __name__ == "__main__":
build_and_install_library("databricks_sql_connector_core") build_and_install_library("databricks_sql_connector") - build_and_install_library("databricks_sqlalchemy") \ No newline at end of file + # build_and_install_library("databricks_sqlalchemy") \ No newline at end of file
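
Usage note: the benchmarking suite added above is configured entirely through the
environment variables read in benchmarking/conftest.py, and each test writes one row to
main.pysql_benchmarking_schema.benchmarking_results under the hard-coded TAG. Every test
runs its query ATTEMPTS (10) times, sorts the timings, drops the fastest and slowest run,
and records the mean of the rest. A minimal local-run sketch follows; it assumes pytest
and the connector are installed, and the angle-bracket values are placeholders, not real
endpoints or credentials.

    import os
    import pytest

    # Warehouse that executes the benchmark queries (benchmarking_* fixtures).
    os.environ["BENCHMARKING_SERVER_HOSTNAME"] = "<benchmarking-workspace-hostname>"  # placeholder
    os.environ["BENCHMARKING_HTTP_PATH"] = "<benchmarking-warehouse-http-path>"       # placeholder
    os.environ["BENCHMARKING_TOKEN"] = "<benchmarking-access-token>"                  # placeholder

    # Warehouse that stores the results table (benchfood_* fixtures).
    os.environ["BENCHFOOD_SERVER_HOSTNAME"] = "<benchfood-workspace-hostname>"        # placeholder
    os.environ["BENCHFOOD_HTTP_PATH"] = "<benchfood-warehouse-http-path>"             # placeholder
    os.environ["BENCHFOOD_TOKEN"] = "<benchfood-access-token>"                        # placeholder

    # Equivalent to running `pytest benchmarking/test_benchmark.py -v` from the repo root.
    raise SystemExit(pytest.main(["benchmarking/test_benchmark.py", "-v"]))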