Add --fast argument to enable experimental optimizations.

Optimizations that might break things or lower quality will be put behind this flag first and might be enabled by default in the future. Currently, the only optimization is float8_e4m3fn matrix multiplication on NVIDIA 4000-series (Ada) cards or later. If you have one of these cards, you will see a speed boost when using, for example, an fp8_e4m3fn Flux model.
2025-09-13 13:05:07 +00:00 · 2024-08-20 11:49:33 -04:00
parent d1a6bd6845
commit 9953f22fce
4 changed files with 52 additions and 5 deletions
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -96,10 +96,7 @@ class BaseModel(torch.nn.Module):

        if not unet_config.get("disable_unet_model_creation", False):
            if model_config.custom_operations is None:
-                if self.manual_cast_dtype is not None:
-                    operations = comfy.ops.manual_cast
-                else:
-                    operations = comfy.ops.disable_weight_init
+                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype)
            else:
                operations = model_config.custom_operations
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)