Improve error handling for multigpu threads

Jedrzej Kosinski 2025-06-24 00:48:51 -05:00
parent 9726eac475
commit 44e053c26d


@@ -3,7 +3,7 @@ from __future__ import annotations
 import comfy.model_management
 from .k_diffusion import sampling as k_diffusion_sampling
 from .extra_samplers import uni_pc
-from typing import TYPE_CHECKING, Callable, NamedTuple
+from typing import TYPE_CHECKING, Callable, NamedTuple, Any
 if TYPE_CHECKING:
     from comfy.model_patcher import ModelPatcher
     from comfy.model_base import BaseModel
@@ -428,8 +428,16 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
                 if batched_to_run_length >= conds_per_device:
                     index_device += 1
 
-    thread_result = collections.namedtuple('thread_result', ['output', 'mult', 'area', 'batch_chunks', 'cond_or_uncond'])
+    class thread_result(NamedTuple):
+        output: Any
+        mult: Any
+        area: Any
+        batch_chunks: int
+        cond_or_uncond: Any
+        error: Exception = None
+
     def _handle_batch(device: torch.device, batch_tuple: tuple[comfy.hooks.HookGroup, tuple], results: list[thread_result]):
+        try:
             model_current: BaseModel = model_options["multigpu_clones"][device].model
             # run every hooked_to_run separately
             with torch.no_grad():
@@ -496,6 +504,9 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
                     else:
                         output = model_current.apply_model(input_x, timestep_, **c).to(output_device).chunk(batch_chunks)
                     results.append(thread_result(output, mult, area, batch_chunks, cond_or_uncond))
+        except Exception as e:
+            results.append(thread_result(None, None, None, None, None, error=e))
+            raise
 
     results: list[thread_result] = []
@@ -508,7 +519,9 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
     for thread in threads:
         thread.join()
-    for output, mult, area, batch_chunks, cond_or_uncond in results:
+    for output, mult, area, batch_chunks, cond_or_uncond, error in results:
+        if error is not None:
+            raise error
         for o in range(batch_chunks):
             cond_index = cond_or_uncond[o]
             a = area[o]
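
Taken out of context, the pattern this commit applies is: each worker thread appends a result tuple whose optional error field records any exception it hit, and the main thread re-raises after joining all threads, so a failure on one device surfaces in the caller instead of silently producing a short results list. Below is a minimal, self-contained sketch of that pattern; the names (ThreadResult, compute, worker, run_batches) are illustrative placeholders, not ComfyUI APIs.

# Minimal sketch of the error-propagation pattern in this commit.
# Illustrative only: ThreadResult, compute, worker, and run_batches
# are not ComfyUI names.
import threading
from typing import Any, NamedTuple, Optional

class ThreadResult(NamedTuple):
    output: Any                        # payload when the worker succeeds
    error: Optional[Exception] = None  # captured exception when it fails

def compute(x: int) -> int:
    if x < 0:
        raise ValueError(f"bad input: {x}")
    return x * x

def worker(x: int, results: list[ThreadResult]) -> None:
    try:
        results.append(ThreadResult(compute(x)))
    except Exception as e:
        # Record the failure for the main thread; the re-raise below only
        # terminates this worker thread, mirroring _handle_batch above.
        results.append(ThreadResult(None, error=e))
        raise

def run_batches(inputs: list[int]) -> list[int]:
    results: list[ThreadResult] = []
    threads = [threading.Thread(target=worker, args=(x, results)) for x in inputs]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for every worker before inspecting results
    for output, error in results:
        if error is not None:
            raise error  # propagate the first captured failure to the caller
    return [output for output, _ in results]

print(sorted(run_batches([1, 2, 3])))  # [1, 4, 9]; sorted since completion order varies
# run_batches([1, -2, 3])              # raises ValueError: bad input: -2

Note that list.append is atomic under CPython's GIL, so the shared results list needs no extra locking here, matching the commit's use of a plain list shared across threads.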