fix: fix quant linear autotune (pass `quantiles` instead of `percentiles` to `triton.testing.do_bench`)

OlivierDehaene · OlivierDehaene · commit 083c2de9f83c · 2023-12-14T16:45:47.000+01:00
diff --git a/server/text_generation_server/utils/gptq/custom_autotune.py b/server/text_generation_server/utils/gptq/custom_autotune.py
@@ -88,7 +88,7 @@ def kernel_call():
             # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
             # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
             return triton.testing.do_bench(
-                kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40
+                kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40
             )
         except triton.OutOfResources:
             return (float("inf"), float("inf"), float("inf"))

Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ def kernel_call():`
`88`	`88`	`# In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses`
`89`	`89`	`# PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default`
`90`	`90`	`return triton.testing.do_bench(`
`91`		`- kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40`
	`91`	`+ kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40`
`92`	`92`	`)`
`93`	`93`	`except triton.OutOfResources:`
`94`	`94`	`return (float("inf"), float("inf"), float("inf"))`