Make applying embeddings more efficient.

Adding new tokens no longer makes a whole copy of the embedding weights,
which can be massive on certain models.
comfyanonymous
2025-03-05 17:34:38 -05:00
parent 5d84607bf3
commit 85ef295069
5 changed files with 81 additions and 61 deletions
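
The pattern applied to every text encoder touched here is the same: forward() gains an embeds argument (plus a num_tokens argument that is not consumed in the hunks shown) and only falls back to the embedding-table lookup when embeds is absent, so custom tokens can be supplied as precomputed vectors instead of by resizing or copying the weight matrix. Below is a minimal, hypothetical sketch of how a caller could build such an embeds tensor; build_embeds, custom_vectors and the toy shapes are illustrative and are not part of this commit or of ComfyUI's API.

# Hypothetical helper (not in this commit): splice custom token vectors into the
# base lookup instead of copying/resizing the embedding weight matrix.
import torch

def build_embeds(word_embeddings, token_ids, custom_vectors):
    # word_embeddings: torch.nn.Embedding holding the original, untouched weight
    # token_ids: LongTensor [batch, seq]; ids >= vocab_size stand for custom tokens
    # custom_vectors: dict {out_of_vocab_id: 1D tensor of length embed_dim}
    vocab_size = word_embeddings.num_embeddings
    safe_ids = token_ids.clamp(max=vocab_size - 1)   # keep the lookup in range
    embeds = word_embeddings(safe_ids)               # [batch, seq, embed_dim]
    for b, s in (token_ids >= vocab_size).nonzero(as_tuple=False).tolist():
        embeds[b, s] = custom_vectors[int(token_ids[b, s])].to(embeds.dtype)
    return embeds

# Toy usage: token id 1000 is a "new" token that never touches the weight matrix.
emb = torch.nn.Embedding(1000, 64)
ids = torch.tensor([[1, 5, 1000]])
extra = {1000: torch.randn(64)}
embeds = build_embeds(emb, ids, extra)   # shape [1, 3, 64]
# The result is then handed to the encoder through the new keyword, e.g.
# out = model(None, attention_mask=mask, embeds=embeds)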


@@ -93,8 +93,11 @@ class BertEmbeddings(torch.nn.Module):
         self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device)
 
-    def forward(self, input_tokens, token_type_ids=None, dtype=None):
-        x = self.word_embeddings(input_tokens, out_dtype=dtype)
+    def forward(self, input_tokens, embeds=None, token_type_ids=None, dtype=None):
+        if embeds is not None:
+            x = embeds
+        else:
+            x = self.word_embeddings(input_tokens, out_dtype=dtype)
         x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x)
         if token_type_ids is not None:
             x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype)
@@ -113,8 +116,8 @@ class BertModel_(torch.nn.Module):
         self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
         self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)
 
-    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
-        x = self.embeddings(input_tokens, dtype=dtype)
+    def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+        x = self.embeddings(input_tokens, embeds=embeds, dtype=dtype)
         mask = None
         if attention_mask is not None:
             mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
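
With the two Bert hunks above, embeds is simply threaded from BertModel_.forward() into BertEmbeddings, and callers that keep passing token ids hit the unchanged word_embeddings path. A hedged usage sketch based only on the signatures shown here; bert, tokens, mask and precomputed are assumed to already exist and are not defined in this hunk:

# Assumptions: `bert` is a constructed BertModel_, `tokens` is a LongTensor [batch, seq],
# `mask` is an attention mask, `precomputed` is a float tensor [batch, seq, embed_dim].
out_ids = bert(tokens, attention_mask=mask, dtype=torch.float16)                    # original token-id path
out_emb = bert(None, attention_mask=mask, embeds=precomputed, dtype=torch.float16)  # new path: the lookup is skipped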


@@ -241,8 +241,11 @@ class Llama2_(nn.Module):
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
         # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
 
-    def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
-        x = self.embed_tokens(x, out_dtype=dtype)
+    def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+        if embeds is not None:
+            x = embeds
+        else:
+            x = self.embed_tokens(x, out_dtype=dtype)
 
         if self.normalize_in:
             x *= self.config.hidden_size ** 0.5


@@ -239,8 +239,11 @@ class T5(torch.nn.Module):
     def set_input_embeddings(self, embeddings):
        self.shared = embeddings
 
-    def forward(self, input_ids, *args, **kwargs):
-        x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
+    def forward(self, input_ids, attention_mask, embeds=None, num_tokens=None, **kwargs):
+        if input_ids is None:
+            x = embeds
+        else:
+            x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
         if self.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
             x = torch.nan_to_num(x) #Fix for fp8 T5 base
-        return self.encoder(x, *args, **kwargs)
+        return self.encoder(x, attention_mask=attention_mask, **kwargs)
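
The T5 wrapper changes shape slightly more than the others: the *args/**kwargs passthrough becomes an explicit attention_mask parameter, and passing input_ids=None is what selects the embeds path. A hedged usage sketch under the same kind of assumptions (t5, ids, mask and precomputed are placeholders, not defined in this hunk):

# Assumptions: `t5` is a constructed T5 model, `ids`/`mask` are [batch, seq] tensors,
# `precomputed` is a float tensor [batch, seq, model_dim].
out_ids = t5(ids, mask)                        # token-id path through self.shared, as before
out_emb = t5(None, mask, embeds=precomputed)   # embeds path: self.shared is never consulted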