xlr8 committed
Commit 0be2076 · Parent(s): b50cb0b
Files changed (1): models.py +36 -62
models.py CHANGED
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import torchtune
 from huggingface_hub import PyTorchModelHubMixin
 from torchtune.models import llama3_2
@@ -67,35 +68,32 @@ def sample_topk_topp(
     temperature: float,
 ) -> torch.Tensor:
     """
-    Returns a tensor of shape (batch_size, 1) of sampled token indices,
-    applying first top-k, then nucleus (top-p), then multinomial sampling.
+    Apply top-k, then nucleus (top-p), then sample.
+    Returns a tensor of shape (batch_size, 1).
     """
-    # scale and softmax
     scaled = logits / temperature
-    probs = torch.softmax(scaled, dim=-1)
+    probs = F.softmax(scaled, dim=-1)
 
-    # apply top-k mask
+    # Top-k
     if topk < probs.size(-1):
         topk_vals, topk_idx = torch.topk(probs, topk, dim=-1)
         mask = torch.zeros_like(probs)
         mask.scatter_(-1, topk_idx, topk_vals)
         probs = mask
 
-    # apply top-p (nucleus)
+    # Nucleus (top-p)
     sorted_probs, sorted_idx = torch.sort(probs, descending=True, dim=-1)
     cumulative = torch.cumsum(sorted_probs, dim=-1)
-    keep_mask = cumulative <= top_p
-    keep_mask[..., 0] = True  # always keep the top token
+    keep = cumulative <= top_p
+    keep[..., 0] = True  # always keep top token
 
     probs_final = torch.zeros_like(probs)
-    probs_final.scatter_(-1, sorted_idx, sorted_probs * keep_mask.float())
+    probs_final.scatter_(-1, sorted_idx, sorted_probs * keep.float())
 
-    # renormalize
     probs_final = probs_final / probs_final.sum(dim=-1, keepdim=True)
 
-    # sample once per batch, keep that extra dim!
-    sample = torch.multinomial(probs_final, num_samples=1)  # (batch_size, 1)
-    return sample
+    # sample once per batch, keep that extra dim
+    return torch.multinomial(probs_final, num_samples=1)  # (batch, 1)
 
 
 @dataclass
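
A quick shape check for the rewritten sampler; a minimal sketch assuming only this diff's sample_topk_topp, with made-up batch and vocab sizes:

import torch
import torch.nn.functional as F  # sample_topk_topp now depends on F.softmax

logits = torch.randn(2, 1024)  # (batch, vocab)
sample = sample_topk_topp(logits, topk=50, top_p=0.9, temperature=0.8)
assert sample.shape == (2, 1)  # extra dim kept for the multinomial draw

One subtlety carried over unchanged from the old version: the cumulative sum for top-p runs over the top-k-masked probabilities before renormalization, so when the total top-k mass is below top_p the nucleus cutoff prunes nothing further.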
@@ -118,9 +116,9 @@ class Model(
         super().__init__()
         self.config = config
 
-        # backbone (text+audio embedding)
+        # Text + audio backbone
         self.backbone, backbone_dim = _prepare_transformer(FLAVORS[config.backbone_flavor]())
-        # decoder (only audio codebooks)
+        # Audio decoder
         self.decoder, decoder_dim = _prepare_transformer(FLAVORS[config.decoder_flavor]())
 
         self.text_embeddings = nn.Embedding(config.text_vocab_size, backbone_dim)
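
For orientation, the constructor only reads a handful of config fields. A hypothetical instantiation sketch; the dataclass name, the FLAVORS keys, and all values below are assumptions, not shown in this diff:

config = ModelArgs(                  # name assumed for the @dataclass above
    backbone_flavor="llama-1B",      # must be a key of FLAVORS; key assumed
    decoder_flavor="llama-100M",     # likewise assumed
    text_vocab_size=128_256,         # illustrative values
    audio_vocab_size=2051,
    audio_num_codebooks=32,
)
model = Model(config)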
@@ -155,59 +153,46 @@ class Model(
155
  @torch.inference_mode()
156
  def generate_frame(
157
  self,
158
- tokens: torch.Tensor,
159
- tokens_mask: torch.Tensor,
160
- input_pos: torch.Tensor,
161
  temperature: float,
162
  topk: int,
163
  top_p: float,
164
  ) -> torch.Tensor:
165
- """
166
- tokens: (batch, seq, codebooks+1)
167
- tokens_mask: (batch, seq, codebooks+1)
168
- input_pos: (batch, seq)
169
- Returns:
170
- Tensor of shape (batch, codebooks) containing one new token per codebook.
171
- """
172
  dtype = next(self.parameters()).dtype
173
 
174
- assert self.backbone.caches_are_enabled(), "backbone caches are not enabled"
175
- # build backbone mask from causal mask + positions
176
- bb_mask = _index_causal_mask(self.backbone_causal_mask, input_pos)
177
-
178
- # embed and encode
179
  embeds = self._embed_tokens(tokens)
180
- h = self.backbone(
181
- (embeds * tokens_mask.unsqueeze(-1)).sum(dim=2),
182
- input_pos=input_pos,
183
- mask=bb_mask,
184
- ).to(dtype=dtype)
185
 
186
- # Take last hidden state
187
  last_h = h[:, -1, :] # (batch, hidden)
188
- last_h_unsq = last_h.unsqueeze(1) # (batch, 1, hidden)
189
 
190
- # ==== CODEBOOK 0 ====
191
- c0_logits = self.codebook0_head(last_h) # (batch, vocab)
192
  c0_sample = sample_topk_topp(c0_logits, topk, top_p, temperature) # (batch,1)
193
- c0_embed = self._embed_audio(0, c0_sample.squeeze(-1)).unsqueeze(1) # (batch,1,hidden)
194
 
195
  # Prepare for decoder
196
  curr_h = torch.cat([last_h_unsq, c0_embed], dim=1) # (batch,2,hidden)
197
  curr_sample = c0_sample.clone() # (batch,1)
198
  curr_pos = torch.arange(0, curr_h.size(1)).unsqueeze(0).to(tokens.device).long() # (1,2)
199
 
200
- # ==== Remaining codebooks ====
201
  self.decoder.reset_caches()
202
  for i in range(1, self.config.audio_num_codebooks):
203
- dec_mask = _index_causal_mask(self.decoder_causal_mask, curr_pos)
204
- dec_h = self.decoder(self.projection(curr_h), input_pos=curr_pos, mask=dec_mask).to(dtype=dtype)
 
205
  ci_logits = torch.mm(dec_h[:, -1, :], self.audio_head[i - 1]) # (batch, vocab)
206
  ci_sample = sample_topk_topp(ci_logits, topk, top_p, temperature) # (batch,1)
207
  ci_embed = self._embed_audio(i, ci_sample.squeeze(-1)).unsqueeze(1) # (batch,1,hidden)
208
 
209
  curr_h = ci_embed
210
- curr_sample = torch.cat([curr_sample, ci_sample], dim=1) # (batch, i+1)
211
  curr_pos = curr_pos[:, -1:] + 1
212
 
213
  return curr_sample # (batch, audio_num_codebooks)
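
The calling convention follows the shape comments now inlined in the signature. A hedged driver sketch, reusing the model from the sketch above; real inputs are omitted, and generate_frame still requires the backbone KV caches to be enabled beforehand, which the deleted assert used to document:

import torch

B, S = 1, 10
K = model.config.audio_num_codebooks
tokens = torch.zeros(B, S, K + 1, dtype=torch.long)         # K codebook columns + 1 text column
tokens_mask = torch.ones(B, S, K + 1)
input_pos = torch.arange(S, dtype=torch.long).unsqueeze(0)  # (batch, seq)

frame = model.generate_frame(tokens, tokens_mask, input_pos,
                             temperature=0.9, topk=50, top_p=0.95)
# frame: (B, K), one new token per codebook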
@@ -218,7 +203,7 @@ class Model(
 
     def _embed_audio(self, codebook: int, tokens: torch.Tensor) -> torch.Tensor:
         """
-        tokens: (batch,) of token IDs for this codebook
+        tokens: (batch,) token IDs for this codebook
         returns: (batch, hidden)
         """
         ids = tokens + codebook * self.config.audio_vocab_size
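
The offset arithmetic above packs every codebook into one flat audio_embeddings table. A worked example, with audio_vocab_size = 2051 assumed purely for illustration: token 7 of codebook 2 maps to row 7 + 2 * 2051 = 4109, so each codebook owns a disjoint slice of the table. _embed_tokens below applies the same offsets in vectorized form via torch.arange.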
@@ -229,26 +214,15 @@ class Model(
         tokens: (batch, seq, codebooks+1)
         returns: (batch, seq, codebooks+1, hidden)
         """
-        # text part (last index of 33)
         text_ids = tokens[:, :, -1]
-        text_emb = self.text_embeddings(text_ids).unsqueeze(-2)  # (batch, seq, 1, hidden)
-
-        # audio codebooks
+        text_emb = self.text_embeddings(text_ids).unsqueeze(-2)
         audio_ids = tokens[:, :, :-1] + (
             self.config.audio_vocab_size * torch.arange(self.config.audio_num_codebooks, device=tokens.device)
-        )  # (batch, seq, codebooks)
+        )
         audio_emb = (
-            self.audio_embeddings(audio_ids.reshape(-1)).reshape(
-                tokens.size(0), tokens.size(1), self.config.audio_num_codebooks, -1
-            )
-        )  # (batch, seq, codebooks, hidden)
+            self.audio_embeddings(audio_ids.reshape(-1))
+            .reshape(tokens.size(0), tokens.size(1), self.config.audio_num_codebooks, -1)
+        )
+        return torch.cat([audio_emb, text_emb], dim=2)
 
-        return torch.cat([audio_emb, text_emb], dim=2)  # (batch, seq, codebooks+1, hidden)
 
-    @classmethod
-    def from_pretrained(cls, repo_id: str):
-        # load args & state from HF repo, e.g. sesame/csm-1b or your fine-tuned xlr8harder model
-        config = cls._load_config(repo_id)  # uses PyTorchModelHubMixin behind the scenes
-        model = cls(config)
-        model.load_state_dict(model._load_state_dict(repo_id), strict=False)
-        return model
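
Dropping the hand-rolled from_pretrained is sound: cls._load_config and model._load_state_dict are defined nowhere in this file, and PyTorchModelHubMixin provides its own from_pretrained. Assuming the mixin sits among Model's bases (it is imported above) and is wired to the config, loading reduces to:

model = Model.from_pretrained("sesame/csm-1b")  # repo id taken from the deleted comment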