Mirror of https://github.com/comfyanonymous/ComfyUI.git, synced 2025-09-11 12:06:23 +00:00
Add Hunyuan 3D 2.1 Support (#8714)
Committed by GitHub · parent a9f1bb10a5 · commit 261421e218
@@ -4,81 +4,458 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from typing import Union, Tuple, List, Callable, Optional

import numpy as np
from einops import repeat, rearrange
import math
from tqdm import tqdm

import logging

import comfy.ops
ops = comfy.ops.disable_weight_init


def generate_dense_grid_points(
    bbox_min: np.ndarray,
    bbox_max: np.ndarray,
    octree_resolution: int,
    indexing: str = "ij",
):
    length = bbox_max - bbox_min
    num_cells = octree_resolution

    x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
    y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
    z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
    xyz = np.stack((xs, ys, zs), axis=-1)
    grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]

    return xyz, grid_size, length
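

# Editor's sketch (illustrative, not part of the commit): an octree resolution
# of 4 over the unit cube yields (4 + 1)^3 grid corners.
def _grid_demo():
    xyz, grid_size, length = generate_dense_grid_points(
        np.array([-1.0, -1.0, -1.0]), np.array([1.0, 1.0, 1.0]), octree_resolution=4)
    return xyz.shape, grid_size  # -> (5, 5, 5, 3), [5, 5, 5]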


def fps(src: torch.Tensor, batch: torch.Tensor, sampling_ratio: float, start_random: bool = True):
    # manually create the pointer vector marking where each batch starts and ends
    assert src.size(0) == batch.numel()

    batch_size = int(batch.max()) + 1
    deg = src.new_zeros(batch_size, dtype=torch.long)
    deg.scatter_add_(0, batch, torch.ones_like(batch))

    ptr_vec = deg.new_zeros(batch_size + 1)
    torch.cumsum(deg, 0, out=ptr_vec[1:])

    sampled_indices = []

    for b in range(batch_size):
        # start and end of each batch
        start, end = ptr_vec[b].item(), ptr_vec[b + 1].item()
        # points from the point cloud
        points = src[start:end]

        num_points = points.size(0)
        num_samples = max(1, math.ceil(num_points * sampling_ratio))

        selected = torch.zeros(num_samples, device=src.device, dtype=torch.long)
        distances = torch.full((num_points,), float("inf"), device=src.device)

        # select a random start point
        if start_random:
            farthest = torch.randint(0, num_points, (1,), device=src.device)
        else:
            farthest = torch.tensor([0], device=src.device, dtype=torch.long)

        for i in range(num_samples):
            selected[i] = farthest
            centroid = points[farthest].squeeze(0)
            dist = torch.norm(points - centroid, dim=1)  # euclidean distance to the current centroid
            distances = torch.minimum(distances, dist)
            farthest = torch.argmax(distances)

        sampled_indices.append(torch.arange(start, end)[selected])

    return torch.cat(sampled_indices, dim=0)
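

# Editor's sketch (illustrative, not part of the commit): FPS on a toy batch of
# two clouds with four points each, keeping a quarter of the points per cloud.
def _fps_demo():
    pts = torch.rand(8, 3)
    batch = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])
    idx = fps(pts, batch, sampling_ratio=0.25, start_random=False)
    return idx.shape  # -> torch.Size([2]); one index per cloud, batch offsets preserved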


class PointCrossAttention(nn.Module):
    def __init__(self,
                 num_latents: int,
                 downsample_ratio: float,
                 pc_size: int,
                 pc_sharpedge_size: int,
                 point_feats: int,
                 width: int,
                 heads: int,
                 layers: int,
                 fourier_embedder,
                 normal_pe: bool = False,
                 qkv_bias: bool = False,
                 use_ln_post: bool = True,
                 qk_norm: bool = True):

        super().__init__()

        self.fourier_embedder = fourier_embedder

        self.pc_size = pc_size
        self.normal_pe = normal_pe
        self.downsample_ratio = downsample_ratio
        self.pc_sharpedge_size = pc_sharpedge_size
        self.num_latents = num_latents
        self.point_feats = point_feats

        self.input_proj = nn.Linear(self.fourier_embedder.out_dim + point_feats, width)

        self.cross_attn = ResidualCrossAttentionBlock(
            width=width,
            heads=heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm
        )

        self.self_attn = None
        if layers > 0:
            self.self_attn = Transformer(
                width=width,
                heads=heads,
                qkv_bias=qkv_bias,
                qk_norm=qk_norm,
                layers=layers
            )

        if use_ln_post:
            self.ln_post = nn.LayerNorm(width)
        else:
            self.ln_post = None

    def sample_points_and_latents(self, point_cloud: torch.Tensor, features: torch.Tensor):
        """
        Randomly subsample points from the point cloud (input_pc),
        further sample the subsampled points with FPS to get query_pc,
        and take the Fourier embeddings of both the input and query point clouds.

        Mental note: the FPS-sampled points (query_pc) act as latent tokens that attend to and learn from the broader context in input_pc.
        Goal: get a smaller representation (query_pc) that captures the entire scene structure by learning from a broader subset (input_pc).
        This is more computationally efficient.

        Features are additional information for each point in the cloud.
        """

        B, _, D = point_cloud.shape

        num_latents = int(self.num_latents)

        num_random_query = self.pc_size / (self.pc_size + self.pc_sharpedge_size) * num_latents
        num_sharpedge_query = num_latents - num_random_query

        # split into random and sharp-edge surface points
        random_pc, sharpedge_pc = torch.split(point_cloud, [self.pc_size, self.pc_sharpedge_size], dim=1)

        assert random_pc.shape[1] <= self.pc_size, "Random surface points size must be less than or equal to pc_size"
        assert sharpedge_pc.shape[1] <= self.pc_sharpedge_size, "Sharpedge surface points size must be less than or equal to pc_sharpedge_size"

        input_random_pc_size = int(num_random_query * self.downsample_ratio)
        random_query_pc, random_input_pc, random_idx_pc, random_idx_query = \
            self.subsample(pc=random_pc, num_query=num_random_query, input_pc_size=input_random_pc_size)

        input_sharpedge_pc_size = int(num_sharpedge_query * self.downsample_ratio)

        if input_sharpedge_pc_size == 0:
            sharpedge_input_pc = torch.zeros(B, 0, D, dtype=random_input_pc.dtype).to(point_cloud.device)
            sharpedge_query_pc = torch.zeros(B, 0, D, dtype=random_query_pc.dtype).to(point_cloud.device)
        else:
            sharpedge_query_pc, sharpedge_input_pc, sharpedge_idx_pc, sharpedge_idx_query = \
                self.subsample(pc=sharpedge_pc, num_query=num_sharpedge_query, input_pc_size=input_sharpedge_pc_size)

        # concat the random and sharp-edge points
        query_pc = torch.cat([random_query_pc, sharpedge_query_pc], dim=1)
        input_pc = torch.cat([random_input_pc, sharpedge_input_pc], dim=1)

        query = self.fourier_embedder(query_pc)
        data = self.fourier_embedder(input_pc)

        if self.point_feats > 0:
            random_surface_features, sharpedge_surface_features = torch.split(features, [self.pc_size, self.pc_sharpedge_size], dim=1)

            input_random_surface_features, query_random_features = \
                self.handle_features(features=random_surface_features, idx_pc=random_idx_pc, batch_size=B,
                                     input_pc_size=input_random_pc_size, idx_query=random_idx_query)

            if input_sharpedge_pc_size == 0:
                input_sharpedge_surface_features = torch.zeros(B, 0, self.point_feats,
                                                               dtype=input_random_surface_features.dtype, device=point_cloud.device)
                query_sharpedge_features = torch.zeros(B, 0, self.point_feats,
                                                       dtype=query_random_features.dtype, device=point_cloud.device)
            else:
                input_sharpedge_surface_features, query_sharpedge_features = \
                    self.handle_features(idx_pc=sharpedge_idx_pc, features=sharpedge_surface_features,
                                         batch_size=B, idx_query=sharpedge_idx_query, input_pc_size=input_sharpedge_pc_size)

            query_features = torch.cat([query_random_features, query_sharpedge_features], dim=1)
            input_features = torch.cat([input_random_surface_features, input_sharpedge_surface_features], dim=1)

            if self.normal_pe:
                # apply the fourier embeddings to the first 3 dims (xyz)
                input_features_pe = self.fourier_embedder(input_features[..., :3])
                query_features_pe = self.fourier_embedder(query_features[..., :3])
                # prepend the PE features to the raw first 3 dims
                input_features = torch.cat([input_features_pe, input_features[..., :3]], dim=-1)
                query_features = torch.cat([query_features_pe, query_features[..., :3]], dim=-1)

            # concat at the channels dim
            query = torch.cat([query, query_features], dim=-1)
            data = torch.cat([data, input_features], dim=-1)

        # don't return pc_info, to avoid unnecessary memory usage
        return query.view(B, -1, query.shape[-1]), data.view(B, -1, data.shape[-1])

    def forward(self, point_cloud: torch.Tensor, features: torch.Tensor):
        query, data = self.sample_points_and_latents(point_cloud=point_cloud, features=features)

        # apply projections
        query = self.input_proj(query)
        data = self.input_proj(data)

        # apply cross attention between query and data
        latents = self.cross_attn(query, data)

        if self.self_attn is not None:
            latents = self.self_attn(latents)

        if self.ln_post is not None:
            latents = self.ln_post(latents)

        return latents

    def subsample(self, pc, num_query, input_pc_size: int):
        """
        num_query: number of points to keep after FPS
        input_pc_size: number of points to select before FPS
        """

        B, _, D = pc.shape
        query_ratio = num_query / input_pc_size

        # random subsampling of points inside the point cloud
        idx_pc = torch.randperm(pc.shape[1], device=pc.device)[:input_pc_size]
        input_pc = pc[:, idx_pc, :]

        # flatten to allow applying fps across the whole batch
        flattened_input_pc = input_pc.view(B * input_pc_size, D)

        # construct a batch_down tensor to tell fps
        # which points belong to which batch
        N_down = int(flattened_input_pc.shape[0] / B)
        batch_down = torch.arange(B).to(pc.device)
        batch_down = torch.repeat_interleave(batch_down, N_down)

        idx_query = fps(flattened_input_pc, batch_down, sampling_ratio=query_ratio)
        query_pc = flattened_input_pc[idx_query].view(B, -1, D)

        return query_pc, input_pc, idx_pc, idx_query

    def handle_features(self, features, idx_pc, input_pc_size, batch_size: int, idx_query):
        B = batch_size

        input_surface_features = features[:, idx_pc, :]
        flattened_input_features = input_surface_features.view(B * input_pc_size, -1)
        query_features = flattened_input_features[idx_query].view(B, -1,
                                                                  flattened_input_features.shape[-1])

        return input_surface_features, query_features
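

# Editor's sketch (illustrative, not part of the commit): the tensor shapes
# subsample produces, mirrored on a toy cloud of 2 batches x 1000 points with
# 200 random picks FPS-reduced to 50 queries per batch.
def _subsample_demo():
    pc = torch.rand(2, 1000, 3)
    idx_pc = torch.randperm(1000)[:200]
    input_pc = pc[:, idx_pc, :]                       # (2, 200, 3)
    flat = input_pc.view(2 * 200, 3)                  # (400, 3)
    batch = torch.repeat_interleave(torch.arange(2), 200)
    idx_query = fps(flat, batch, sampling_ratio=50 / 200)
    query_pc = flat[idx_query].view(2, -1, 3)         # (2, 50, 3)
    return query_pc.shape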


def normalize_mesh(mesh, scale=0.9999):
    """Normalize a mesh to fit in [-scale, scale] and translate it so its center is at [0, 0, 0]."""

    bbox = mesh.bounds
    center = (bbox[1] + bbox[0]) / 2

    max_extent = (bbox[1] - bbox[0]).max()
    mesh.apply_translation(-center)
    mesh.apply_scale((2 * scale) / max_extent)

    return mesh


def sample_pointcloud(mesh, num=200000):
    """Uniformly sample points (and their normals) from the surface of the mesh."""

    points, face_idx = mesh.sample(num, return_index=True)
    normals = mesh.face_normals[face_idx]
    return torch.from_numpy(points.astype(np.float32)), torch.from_numpy(normals.astype(np.float32))


def detect_sharp_edges(mesh, threshold=0.985):
    """Return edge indices (a, b) that lie on sharp boundaries of the mesh."""

    V, F = mesh.vertices, mesh.faces
    VN, FN = mesh.vertex_normals, mesh.face_normals

    # per-vertex sharpness: the worst (smallest) alignment between a vertex
    # normal and the normals of the faces that touch it
    sharp_mask = np.ones(V.shape[0])
    for i in range(3):
        indices = F[:, i]
        alignment = np.einsum('ij,ij->i', VN[indices], FN)
        dot_stack = np.stack((sharp_mask[indices], alignment), axis=-1)
        sharp_mask[indices] = np.min(dot_stack, axis=-1)

    edge_a = np.concatenate([F[:, 0], F[:, 1], F[:, 2]])
    edge_b = np.concatenate([F[:, 1], F[:, 2], F[:, 0]])
    sharp_edges = (sharp_mask[edge_a] < threshold) & (sharp_mask[edge_b] < threshold)

    return edge_a[sharp_edges], edge_b[sharp_edges]
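

# Editor's sketch (illustrative, not part of the commit): on a unit cube every
# corner is a 90-degree junction, so vertex and face normals disagree strongly
# (dot products around 1/sqrt(3)) and its edges register as sharp.
def _sharp_edges_demo():
    import trimesh
    box = trimesh.creation.box()
    edge_a, edge_b = detect_sharp_edges(box)
    return edge_a.shape[0] > 0  # -> True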


def sharp_sample_pointcloud(mesh, num=16384):
    """Sample points preferentially from sharp edges in the mesh."""

    edge_a, edge_b = detect_sharp_edges(mesh)
    V, VN = mesh.vertices, mesh.vertex_normals

    va, vb = V[edge_a], V[edge_b]
    na, nb = VN[edge_a], VN[edge_b]

    # sample edges with probability proportional to their length
    edge_lengths = np.linalg.norm(vb - va, axis=-1)
    weights = edge_lengths / edge_lengths.sum()

    indices = np.searchsorted(np.cumsum(weights), np.random.rand(num))
    t = np.random.rand(num, 1)

    # linearly interpolate positions and normals along each sampled edge
    samples = t * va[indices] + (1 - t) * vb[indices]
    normals = t * na[indices] + (1 - t) * nb[indices]

    return samples.astype(np.float32), normals.astype(np.float32)
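
# Editor's note (worked example, not part of the commit): np.searchsorted over
# the cumulative weights implements length-proportional edge sampling: with
# edge lengths [1, 3], the cumsum of weights is [0.25, 1.0], so a uniform draw
# selects the longer edge about 75% of the time.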


def load_surface_sharpedge(mesh, num_points=4096, num_sharp_points=4096, sharpedge_flag=True, device="cuda"):
    """Load a surface with optional sharp-edge annotations from a trimesh mesh."""

    import trimesh

    try:
        mesh_full = trimesh.util.concatenate(mesh.dump())
    except Exception:
        mesh_full = trimesh.util.concatenate(mesh)

    mesh_full = normalize_mesh(mesh_full)

    faces = mesh_full.faces
    vertices = mesh_full.vertices
    origin_face_count = faces.shape[0]

    mesh_surface = trimesh.Trimesh(vertices=vertices, faces=faces[:origin_face_count])
    mesh_fill = trimesh.Trimesh(vertices=vertices, faces=faces[origin_face_count:])

    area_surface = mesh_surface.area
    area_fill = mesh_fill.area
    total_area = area_surface + area_fill

    sample_num = 499712 // 2
    fill_ratio = area_fill / total_area if total_area > 0 else 0

    num_fill = int(sample_num * fill_ratio)
    num_surface = sample_num - num_fill

    surf_pts, surf_normals = sample_pointcloud(mesh_surface, num_surface)
    fill_pts, fill_normals = (torch.zeros(0, 3), torch.zeros(0, 3)) if num_fill == 0 else sample_pointcloud(mesh_fill, num_fill)

    sharp_pts, sharp_normals = sharp_sample_pointcloud(mesh_surface, sample_num)

    def assemble_tensor(points, normals, label=None):
        # pack points and normals as (N, 6); optionally append a constant
        # sharp/uniform label column to get (N, 7)
        data = torch.cat([points, normals], dim=1).half().to(device)

        if label is not None:
            label_tensor = torch.full((data.shape[0], 1), float(label), dtype=torch.float16).to(device)
            data = torch.cat([data, label_tensor], dim=1)

        return data

    surface = assemble_tensor(torch.cat([surf_pts.to(device), fill_pts.to(device)], dim=0),
                              torch.cat([surf_normals.to(device), fill_normals.to(device)], dim=0),
                              label=0 if sharpedge_flag else None)

    sharp_surface = assemble_tensor(torch.from_numpy(sharp_pts), torch.from_numpy(sharp_normals),
                                    label=1 if sharpedge_flag else None)

    rng = np.random.default_rng()

    surface = surface[rng.choice(surface.shape[0], num_points, replace=False)]
    sharp_surface = sharp_surface[rng.choice(sharp_surface.shape[0], num_sharp_points, replace=False)]

    full = torch.cat([surface, sharp_surface], dim=0).unsqueeze(0)

    return full


class SharpEdgeSurfaceLoader:
    """Load mesh surface and sharp-edge samples."""

    def __init__(self, num_uniform_points=8192, num_sharp_points=8192):
        self.num_uniform_points = num_uniform_points
        self.num_sharp_points = num_sharp_points
        self.total_points = num_uniform_points + num_sharp_points

    def __call__(self, mesh_input, device="cuda"):
        mesh = self._load_mesh(mesh_input)
        return load_surface_sharpedge(mesh, self.num_uniform_points, self.num_sharp_points, device=device)

    @staticmethod
    def _load_mesh(mesh_input):
        import trimesh

        if isinstance(mesh_input, str):
            mesh = trimesh.load(mesh_input, force="mesh", merge_primitives=True)
        else:
            mesh = mesh_input

        if isinstance(mesh, trimesh.Scene):
            combined = None
            for obj in mesh.geometry.values():
                combined = obj if combined is None else combined + obj
            return combined

        return mesh
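

# Editor's sketch (hypothetical usage, not part of the commit; the file name is
# a placeholder): turn a mesh file into the (1, N, 7) surface tensor consumed
# by the VAE encoder.
def _loader_demo(mesh_path="model.glb"):
    loader = SharpEdgeSurfaceLoader(num_uniform_points=8192, num_sharp_points=8192)
    surface = loader(mesh_path, device="cpu")
    # columns: xyz (3) + normals (3) + sharp/uniform label (1)
    return surface.shape  # -> (1, 16384, 7)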


class DiagonalGaussianDistribution:
    def __init__(self, params: torch.Tensor, feature_dim: int = -1):
        # divide the quant channels (8) into mean and log-variance halves
        self.mean, self.logvar = torch.chunk(params, 2, dim=feature_dim)

        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.std = torch.exp(0.5 * self.logvar)

    def sample(self):
        # reparameterization trick: z = mean + std * eps, with eps ~ N(0, I)
        eps = torch.randn_like(self.std)
        z = self.mean + eps * self.std

        return z
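

# Editor's sketch (illustrative, not part of the commit): splitting 8 quant
# channels into mean and log-variance halves means sampling returns 4 channels.
def _gaussian_demo():
    params = torch.randn(1, 4096, 8)
    z = DiagonalGaussianDistribution(params).sample()
    return z.shape  # -> (1, 4096, 4)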


################################################
# Volume Decoder
################################################


class VanillaVolumeDecoder:
    @torch.no_grad()
    def __call__(self, latents: torch.Tensor, geo_decoder: callable, octree_resolution: int, bounds=1.01,
                 num_chunks: int = 10_000, enable_pbar: bool = True, **kwargs):

        # 1. generate query points
        if isinstance(bounds, float):
            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]

        bbox_min, bbox_max = torch.tensor(bounds[:3]), torch.tensor(bounds[3:])

        x = torch.linspace(bbox_min[0], bbox_max[0], int(octree_resolution) + 1, dtype=torch.float32)
        y = torch.linspace(bbox_min[1], bbox_max[1], int(octree_resolution) + 1, dtype=torch.float32)
        z = torch.linspace(bbox_min[2], bbox_max[2], int(octree_resolution) + 1, dtype=torch.float32)

        [xs, ys, zs] = torch.meshgrid(x, y, z, indexing="ij")
        xyz = torch.stack((xs, ys, zs), axis=-1).to(latents.device, dtype=latents.dtype).contiguous().reshape(-1, 3)
        grid_size = [int(octree_resolution) + 1, int(octree_resolution) + 1, int(octree_resolution) + 1]

        # 2. latents to 3d volume, decoded in chunks to bound memory use
        batch_logits = []
        for start in tqdm(range(0, xyz.shape[0], num_chunks), desc="Volume Decoding",
                          disable=not enable_pbar):
            chunk_queries = xyz[start: start + num_chunks, :]
            chunk_queries = chunk_queries.unsqueeze(0).repeat(latents.shape[0], 1, 1)
            logits = geo_decoder(queries=chunk_queries, latents=latents)
            batch_logits.append(logits)

        grid_logits = torch.cat(batch_logits, dim=1)
        grid_logits = grid_logits.view((latents.shape[0], *grid_size)).float()

        return grid_logits
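

# Editor's sketch (hypothetical usage, not part of the commit): decode latents
# to a logits grid and extract a mesh with marching cubes; `geo_decoder` is any
# callable matching the queries/latents signature above, and scikit-image is an
# assumed extra dependency.
def _volume_decode_demo(latents, geo_decoder, octree_resolution=256):
    from skimage import measure
    grid_logits = VanillaVolumeDecoder()(latents, geo_decoder, octree_resolution)
    verts, faces, _, _ = measure.marching_cubes(grid_logits[0].cpu().numpy(), level=0.0)
    return verts, faces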


class FourierEmbedder(nn.Module):
    """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
    each feature dimension of `x[..., i]` into:
@@ -175,13 +552,11 @@ class FourierEmbedder(nn.Module):
        else:
            return x


class CrossAttentionProcessor:
    def __call__(self, attn, q, k, v):
        out = comfy.ops.scaled_dot_product_attention(q, k, v)
        return out


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
@@ -232,38 +607,41 @@ class MLP(nn.Module):
    def forward(self, x):
        return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))


class QKVMultiheadCrossAttention(nn.Module):
    def __init__(
        self,
        *,
        heads: int,
        n_data=None,
        width=None,
        qk_norm=False,
        norm_layer=ops.LayerNorm
    ):
        super().__init__()
        self.heads = heads
        self.n_data = n_data
        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()

        self.attn_processor = CrossAttentionProcessor()

    def forward(self, q, kv):
        _, n_ctx, _ = q.shape
        bs, n_data, width = kv.shape

        # split the packed kv tensor into per-head keys and values
        attn_ch = width // self.heads // 2
        q = q.view(bs, n_ctx, self.heads, -1)

        kv = kv.view(bs, n_data, self.heads, -1)
        k, v = torch.split(kv, attn_ch, dim=-1)

        q = self.q_norm(q)
        k = self.k_norm(k)

        # (b, n, h, d) -> (b, h, n, d) for scaled_dot_product_attention
        q, k, v = [t.permute(0, 2, 1, 3) for t in (q, k, v)]
        out = F.scaled_dot_product_attention(q, k, v)

        out = out.transpose(1, 2).reshape(bs, n_ctx, -1)

        return out
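
# Editor's note (worked example, not part of the commit): in the forward above,
# `width` is the channel count of the packed kv tensor, which holds keys and
# values together. With 16 heads and a kv width of 2048, each head receives
# attn_ch = 2048 // 16 // 2 = 64 key channels and 64 value channels.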


class MultiheadCrossAttention(nn.Module):
    def __init__(
@@ -306,7 +684,6 @@ class MultiheadCrossAttention(nn.Module):
        x = self.c_proj(x)
        return x


class ResidualCrossAttentionBlock(nn.Module):
    def __init__(
        self,
@@ -366,7 +743,7 @@ class QKVMultiheadAttention(nn.Module):
        q = self.q_norm(q)
        k = self.k_norm(k)

        q, k, v = [t.permute(0, 2, 1, 3) for t in (q, k, v)]
        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
        return out
@@ -383,8 +760,7 @@ class MultiheadAttention(nn.Module):
        drop_path_rate: float = 0.0
    ):
        super().__init__()
        self.width = width
        self.heads = heads

        self.c_qkv = ops.Linear(width, width * 3, bias=qkv_bias)
        self.c_proj = ops.Linear(width, width)
        self.attention = QKVMultiheadAttention(
@@ -491,7 +867,7 @@ class CrossAttentionDecoder(nn.Module):
        self.query_proj = ops.Linear(self.fourier_embedder.out_dim, width)
        if self.downsample_ratio != 1:
            self.latents_proj = ops.Linear(width * downsample_ratio, width)
        if not self.enable_ln_post:
            qk_norm = False
        self.cross_attn_decoder = ResidualCrossAttentionBlock(
            width=width,
@@ -522,28 +898,44 @@ class CrossAttentionDecoder(nn.Module):

class ShapeVAE(nn.Module):
    def __init__(
        self,
        *,
        num_latents: int = 4096,
        embed_dim: int = 64,
        width: int = 1024,
        heads: int = 16,
        num_decoder_layers: int = 16,
        num_encoder_layers: int = 8,
        pc_size: int = 81920,
        pc_sharpedge_size: int = 0,
        point_feats: int = 4,
        downsample_ratio: int = 20,
        geo_decoder_downsample_ratio: int = 1,
        geo_decoder_mlp_expand_ratio: int = 4,
        geo_decoder_ln_post: bool = True,
        num_freqs: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = True,
        drop_path_rate: float = 0.0,
        include_pi: bool = False,
        scale_factor: float = 1.0039506158752403,
        label_type: str = "binary",
    ):
        super().__init__()
        self.geo_decoder_ln_post = geo_decoder_ln_post

        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)

        self.encoder = PointCrossAttention(layers=num_encoder_layers,
                                           num_latents=num_latents,
                                           downsample_ratio=downsample_ratio,
                                           heads=heads,
                                           pc_size=pc_size,
                                           width=width,
                                           point_feats=point_feats,
                                           fourier_embedder=self.fourier_embedder,
                                           pc_sharpedge_size=pc_sharpedge_size)

        self.post_kl = ops.Linear(embed_dim, width)

        self.transformer = Transformer(
@@ -583,5 +975,14 @@ class ShapeVAE(nn.Module):
        grid_logits = self.volume_decoder(latents, self.geo_decoder, bounds=bounds, num_chunks=num_chunks, octree_resolution=octree_resolution, enable_pbar=enable_pbar)
        return grid_logits.movedim(-2, -1)

    def encode(self, surface):
        # split the surface tensor into xyz coordinates and per-point features
        pc, feats = surface[:, :, :3], surface[:, :, 3:]
        latents = self.encoder(pc, feats)

        moments = self.pre_kl(latents)
        posterior = DiagonalGaussianDistribution(moments, feature_dim=-1)

        latents = posterior.sample()

        return latents
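

# Editor's sketch (hypothetical end-to-end flow, not part of the commit; the
# loader settings, the fp32 cast, and the `decode` call are assumptions about
# usage): mesh file -> surface samples -> VAE latents -> decoded logits grid.
def _shape_vae_demo(vae, mesh_path="model.glb"):
    loader = SharpEdgeSurfaceLoader()
    surface = loader(mesh_path, device="cpu").float()  # (1, N, 7): xyz + normal + label
    latents = vae.encode(surface)
    grid_logits = vae.decode(latents)  # assuming default bounds/num_chunks/octree_resolution
    return grid_logits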