diff --git a/README.md b/README.md
index 8234af02..93c7b3ec 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,8 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
 - Saving/Loading workflows as Json files.
 - Nodes interface can be used to create complex workflows like one for [Hires fix](https://comfyanonymous.github.io/ComfyUI_examples/2_pass_txt2img/) or much more advanced ones.
 - [Area Composition](https://comfyanonymous.github.io/ComfyUI_examples/area_composition/)
+- [Inpainting](https://comfyanonymous.github.io/ComfyUI_examples/inpaint/) with both regular and inpainting models.
+- [ControlNet](https://comfyanonymous.github.io/ComfyUI_examples/controlnet/)
 - Starts up very fast.
 - Works fully offline: will never download anything.
 
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 1301f746..8c859d3f 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -3,6 +3,7 @@ CPU = 0
 NO_VRAM = 1
 LOW_VRAM = 2
 NORMAL_VRAM = 3
+HIGH_VRAM = 4
 
 accelerate_enabled = False
 vram_state = NORMAL_VRAM
@@ -27,10 +28,11 @@
 if "--lowvram" in sys.argv:
     set_vram_to = LOW_VRAM
 if "--novram" in sys.argv:
     set_vram_to = NO_VRAM
+if "--highvram" in sys.argv:
+    vram_state = HIGH_VRAM
 
-
-if set_vram_to != NORMAL_VRAM:
+if set_vram_to == LOW_VRAM or set_vram_to == NO_VRAM:
     try:
         import accelerate
         accelerate_enabled = True
@@ -44,7 +46,7 @@ if set_vram_to != NORMAL_VRAM:
         total_vram_available_mb = int(max(256, total_vram_available_mb))
 
 
-print("Set vram state to:", ["CPU", "NO VRAM", "LOW VRAM", "NORMAL VRAM"][vram_state])
+print("Set vram state to:", ["CPU", "NO VRAM", "LOW VRAM", "NORMAL VRAM", "HIGH VRAM"][vram_state])
 
 
 current_loaded_model = None
@@ -57,18 +59,24 @@ def unload_model():
     global current_loaded_model
     global model_accelerated
     global current_gpu_controlnets
+    global vram_state
+
     if current_loaded_model is not None:
         if model_accelerated:
             accelerate.hooks.remove_hook_from_submodules(current_loaded_model.model)
             model_accelerated = False
 
-        current_loaded_model.model.cpu()
+        #never unload models from GPU on high vram
+        if vram_state != HIGH_VRAM:
+            current_loaded_model.model.cpu()
         current_loaded_model.unpatch_model()
         current_loaded_model = None
-    if len(current_gpu_controlnets) > 0:
-        for n in current_gpu_controlnets:
-            n.cpu()
-        current_gpu_controlnets = []
+
+    if vram_state != HIGH_VRAM:
+        if len(current_gpu_controlnets) > 0:
+            for n in current_gpu_controlnets:
+                n.cpu()
+            current_gpu_controlnets = []
 
 
 def load_model_gpu(model):
@@ -87,7 +95,7 @@ def load_model_gpu(model):
     current_loaded_model = model
     if vram_state == CPU:
         pass
-    elif vram_state == NORMAL_VRAM:
+    elif vram_state == NORMAL_VRAM or vram_state == HIGH_VRAM:
         model_accelerated = False
         real_model.cuda()
     else:
diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py
index 2b94d281..998babe8 100644
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -178,7 +178,6 @@ def load_embed(embedding_name, embedding_directory):
                 valid_file = t
                 break
         if valid_file is None:
-            print("warning, embedding {} does not exist, ignoring".format(embed_path))
             return None
         else:
             embed_path = valid_file
@@ -187,7 +186,10 @@
         import safetensors.torch
         embed = safetensors.torch.load_file(embed_path, device="cpu")
     else:
-        embed = torch.load(embed_path, weights_only=True, map_location="cpu")
+        if 'weights_only' in torch.load.__code__.co_varnames:
+            embed = torch.load(embed_path, weights_only=True, map_location="cpu")
+        else:
+            embed = torch.load(embed_path, map_location="cpu")
     if 'string_to_param' in embed:
         values = embed['string_to_param'].values()
     else:
@@ -218,18 +220,28 @@ class SD1Tokenizer:
         tokens = []
         for t in parsed_weights:
             to_tokenize = unescape_important(t[0]).replace("\n", " ").split(' ')
-            for word in to_tokenize:
+            while len(to_tokenize) > 0:
+                word = to_tokenize.pop(0)
                 temp_tokens = []
                 embedding_identifier = "embedding:"
                 if word.startswith(embedding_identifier) and self.embedding_directory is not None:
                     embedding_name = word[len(embedding_identifier):].strip('\n')
                     embed = load_embed(embedding_name, self.embedding_directory)
+                    if embed is None:
+                        stripped = embedding_name.strip(',')
+                        if len(stripped) < len(embedding_name):
+                            embed = load_embed(stripped, self.embedding_directory)
+                            if embed is not None:
+                                to_tokenize.insert(0, embedding_name[len(stripped):])
+
                     if embed is not None:
                         if len(embed.shape) == 1:
                             temp_tokens += [(embed, t[1])]
                         else:
                             for x in range(embed.shape[0]):
                                 temp_tokens += [(embed[x], t[1])]
+                    else:
+                        print("warning, embedding:{} does not exist, ignoring".format(embedding_name))
                 elif len(word) > 0:
                     tt = self.tokenizer(word)["input_ids"][1:-1]
                     for x in tt:
diff --git a/main.py b/main.py
index f5aec442..54c66dac 100644
--- a/main.py
+++ b/main.py
@@ -29,6 +29,7 @@ if __name__ == "__main__":
         print("\t--dont-upcast-attention\t\tDisable upcasting of attention \n\t\t\t\t\tcan boost speed but increase the chances of black images.\n")
         print("\t--use-split-cross-attention\tUse the split cross attention optimization instead of the sub-quadratic one.\n\t\t\t\t\tIgnored when xformers is used.")
         print()
+        print("\t--highvram\t\t\tBy default models will be unloaded to CPU memory after being used.\n\t\t\t\t\tThis option keeps them in GPU memory.\n")
         print("\t--normalvram\t\t\tUsed to force normal vram use if lowvram gets automatically enabled.")
         print("\t--lowvram\t\t\tSplit the unet in parts to use less vram.")
         print("\t--novram\t\t\tWhen lowvram isn't enough.")
@@ -208,6 +209,7 @@ class PromptExecutor:
         executed = set(executed)
         for x in executed:
             self.old_prompt[x] = copy.deepcopy(prompt[x])
+        torch.cuda.empty_cache()
 
 def validate_inputs(prompt, item):
     unique_id = item
diff --git a/models/configs/v1-inpainting-inference.yaml b/models/configs/v1-inpainting-inference.yaml
new file mode 100644
index 00000000..45f3f82d
--- /dev/null
+++ b/models/configs/v1-inpainting-inference.yaml
@@ -0,0 +1,71 @@
+model:
+  base_learning_rate: 7.5e-05
+  target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false  # Note: different from the one we trained before
+    conditioning_key: hybrid  # important
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    finetune_keys: null
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 2500 ] # NOTE for resuming. use 10000 if starting from scratch
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 9  # 4 data + 4 downscaled image + 1 mask
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
diff --git a/nodes.py b/nodes.py
index b35c09de..ef1201de 100644
--- a/nodes.py
+++ b/nodes.py
@@ -759,7 +759,7 @@ def load_custom_nodes():
         module_path = os.path.join(CUSTOM_NODE_PATH, possible_module)
         if os.path.isfile(module_path) and os.path.splitext(module_path)[1] != ".py": continue
 
-        module_name = "custom_node_module.{}".format(possible_module)
+        module_name = possible_module
         try:
             if os.path.isfile(module_path):
                 module_spec = importlib.util.spec_from_file_location(module_name, module_path)
diff --git a/notebooks/comfyui_colab.ipynb b/notebooks/comfyui_colab.ipynb
index cdf182b8..2e364f16 100644
--- a/notebooks/comfyui_colab.ipynb
+++ b/notebooks/comfyui_colab.ipynb
@@ -85,7 +85,7 @@
   {
    "cell_type": "markdown",
    "source": [
-    "Run ComfyUI:"
+    "Run ComfyUI (use the fp16 model configs for more speed):"
    ],
    "metadata": {
     "id": "gggggggggg"
@@ -112,7 +112,7 @@
    "\n",
    "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n",
    "\n",
-   "!python main.py"
+   "!python main.py --highvram"
   ],
   "metadata": {
    "id": "hhhhhhhhhh"
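Aside (not part of the patch): one detail worth isolating is the backward-compatible torch.load call added in comfy/sd1_clip.py. The weights_only keyword only exists in newer PyTorch releases, so the patch probes torch.load's signature before passing it. A minimal standalone sketch of that pattern, with a hypothetical helper name and no ComfyUI dependencies, looks like this:

    import torch

    def load_embedding_compat(path):
        # Newer PyTorch releases expose a weights_only flag that refuses to
        # unpickle arbitrary Python objects (safer for untrusted embedding files).
        # Older releases raise TypeError on the unknown keyword, so check the
        # function's signature before using it.
        if 'weights_only' in torch.load.__code__.co_varnames:
            return torch.load(path, weights_only=True, map_location="cpu")
        return torch.load(path, map_location="cpu")

This keeps textual-inversion embeddings loading on older torch installs without pinning a minimum version, while still taking the safer loading path where it is available.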