diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index de3e85c08..72eeaea9a 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -143,6 +143,7 @@ class PerformanceFeature(enum.Enum):
     Fp16Accumulation = "fp16_accumulation"
     Fp8MatrixMultiplication = "fp8_matrix_mult"
     CublasOps = "cublas_ops"
+    AutoTune = "autotune"  # turns on torch.backends.cudnn.benchmark (see comfy/ops.py)
 
-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
+parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops autotune")
 
diff --git a/comfy/ops.py b/comfy/ops.py
index 18e7db705..55e958adb 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -52,6 +52,12 @@ except (ModuleNotFoundError, TypeError):
 
 cast_to = comfy.model_management.cast_to #TODO: remove once no more references
 
+# Opt-in through `--fast autotune` (or a bare `--fast`). cuDNN benchmark mode times the available
+# convolution algorithms for each new input configuration and reuses the fastest one, so the first
+# run of every configuration is slower. The flag is tested first so no backend is probed when unused.
+if PerformanceFeature.AutoTune in args.fast and torch.cuda.is_available() and torch.backends.cudnn.is_available():
+    torch.backends.cudnn.benchmark = True
+
 def cast_to_input(weight, input, non_blocking=False, copy=True):
     return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)