Spaces:

xlr8harder
/

tahm_kench

Sleeping

App Files Files Community

xlr8 committed on Apr 25

Commit

5a36a74

1 Parent(s): 0be2076

and again

Browse files

Files changed (1) hide show

models.py +32 -29

models.py CHANGED Viewed

@@ -71,29 +71,36 @@ def sample_topk_topp(
     Apply top-k, then nucleus (top-p), then sample.
     Returns a tensor of shape (batch_size, 1).
     """
     scaled = logits / temperature
     probs = F.softmax(scaled, dim=-1)
-    # Top-k
     if topk < probs.size(-1):
         topk_vals, topk_idx = torch.topk(probs, topk, dim=-1)
-        mask = torch.zeros_like(probs)
-        mask.scatter_(-1, topk_idx, topk_vals)
-        probs = mask
-    # Nucleus (top-p)
     sorted_probs, sorted_idx = torch.sort(probs, descending=True, dim=-1)
     cumulative = torch.cumsum(sorted_probs, dim=-1)
     keep = cumulative <= top_p
-    keep[..., 0] = True  # always keep top token
     probs_final = torch.zeros_like(probs)
-    probs_final.scatter_(-1, sorted_idx, sorted_probs * keep.float())
     probs_final = probs_final / probs_final.sum(dim=-1, keepdim=True)
     # sample once per batch, keep that extra dim
-    return torch.multinomial(probs_final, num_samples=1)  # (batch, 1)
 @dataclass
@@ -116,7 +123,7 @@ class Model(
         super().__init__()
         self.config = config
-        # Text + audio backbone
         self.backbone, backbone_dim = _prepare_transformer(FLAVORS[config.backbone_flavor]())
         # Audio decoder
         self.decoder, decoder_dim = _prepare_transformer(FLAVORS[config.decoder_flavor]())
@@ -162,32 +169,36 @@ class Model(
     ) -> torch.Tensor:
         dtype = next(self.parameters()).dtype
-        # Backbone forward
-        mask_bb = _index_causal_mask(self.backbone_causal_mask, input_pos)
         embeds = self._embed_tokens(tokens)
-        h = self.backbone((embeds * tokens_mask.unsqueeze(-1)).sum(dim=2), input_pos=input_pos, mask=mask_bb).to(dtype=dtype)
-        # Last hidden
         last_h = h[:, -1, :]               # (batch, hidden)
         last_h_unsq = last_h.unsqueeze(1)  # (batch,1,hidden)
-        # Codebook 0
-        c0_logits = self.codebook0_head(last_h)              # (batch, vocab)
         c0_sample = sample_topk_topp(c0_logits, topk, top_p, temperature)  # (batch,1)
         c0_embed = self._embed_audio(0, c0_sample.squeeze(-1)).unsqueeze(1)  # (batch,1,hidden)
-        # Prepare for decoder
         curr_h = torch.cat([last_h_unsq, c0_embed], dim=1)  # (batch,2,hidden)
         curr_sample = c0_sample.clone()                     # (batch,1)
-        curr_pos = torch.arange(0, curr_h.size(1)).unsqueeze(0).to(tokens.device).long()  # (1,2)
-        # Remaining codebooks
         self.decoder.reset_caches()
         for i in range(1, self.config.audio_num_codebooks):
-            mask_dec = _index_causal_mask(self.decoder_causal_mask, curr_pos)
-            dec_h = self.decoder(self.projection(curr_h), input_pos=curr_pos, mask=mask_dec).to(dtype=dtype)
-            ci_logits = torch.mm(dec_h[:, -1, :], self.audio_head[i - 1])  # (batch, vocab)
             ci_sample = sample_topk_topp(ci_logits, topk, top_p, temperature)  # (batch,1)
             ci_embed = self._embed_audio(i, ci_sample.squeeze(-1)).unsqueeze(1)  # (batch,1,hidden)
@@ -202,18 +213,10 @@ class Model(
         self.decoder.reset_caches()
     def _embed_audio(self, codebook: int, tokens: torch.Tensor) -> torch.Tensor:
-        """
-        tokens: (batch,) token IDs for this codebook
-        returns: (batch, hidden)
-        """
         ids = tokens + codebook * self.config.audio_vocab_size
         return self.audio_embeddings(ids)
     def _embed_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
-        """
-        tokens: (batch, seq, codebooks+1)
-        returns: (batch, seq, codebooks+1, hidden)
-        """
         text_ids = tokens[:, :, -1]
         text_emb = self.text_embeddings(text_ids).unsqueeze(-2)
         audio_ids = tokens[:, :, :-1] + (

     Apply top-k, then nucleus (top-p), then sample.
     Returns a tensor of shape (batch_size, 1).
     """
+    # scale + softmax
     scaled = logits / temperature
     probs = F.softmax(scaled, dim=-1)
+    # --- top-k ---
     if topk < probs.size(-1):
         topk_vals, topk_idx = torch.topk(probs, topk, dim=-1)
+        mask_k = torch.zeros_like(probs)
+        mask_k.scatter_(-1, topk_idx, topk_vals)
+        probs = mask_k
+    # --- top-p (nucleus) ---
     sorted_probs, sorted_idx = torch.sort(probs, descending=True, dim=-1)
     cumulative = torch.cumsum(sorted_probs, dim=-1)
     keep = cumulative <= top_p
+    keep[..., 0] = True  # always keep highest-prob
+    # cast mask to same dtype as sorted_probs
+    keep = keep.to(sorted_probs.dtype)
+    # build final probabilities in correct dtype
     probs_final = torch.zeros_like(probs)
+    src = sorted_probs * keep  # same dtype
+    probs_final.scatter_(-1, sorted_idx, src)
+    # renormalize
     probs_final = probs_final / probs_final.sum(dim=-1, keepdim=True)
     # sample once per batch, keep that extra dim
+    return torch.multinomial(probs_final, num_samples=1)  # shape (batch,1)
 @dataclass
         super().__init__()
         self.config = config
+        # Text+audio backbone
         self.backbone, backbone_dim = _prepare_transformer(FLAVORS[config.backbone_flavor]())
         # Audio decoder
         self.decoder, decoder_dim = _prepare_transformer(FLAVORS[config.decoder_flavor]())
     ) -> torch.Tensor:
         dtype = next(self.parameters()).dtype
+        # Backbone pass
+        bb_mask = _index_causal_mask(self.backbone_causal_mask, input_pos)
         embeds = self._embed_tokens(tokens)
+        h = self.backbone(
+            (embeds * tokens_mask.unsqueeze(-1)).sum(dim=2),
+            input_pos=input_pos,
+            mask=bb_mask,
+        ).to(dtype=dtype)
+        # Last hidden state
         last_h = h[:, -1, :]               # (batch, hidden)
         last_h_unsq = last_h.unsqueeze(1)  # (batch,1,hidden)
+        # --- codebook 0 ---
+        c0_logits = self.codebook0_head(last_h)  # (batch, vocab)
         c0_sample = sample_topk_topp(c0_logits, topk, top_p, temperature)  # (batch,1)
         c0_embed = self._embed_audio(0, c0_sample.squeeze(-1)).unsqueeze(1)  # (batch,1,hidden)
+        # Prepare decoder input
         curr_h = torch.cat([last_h_unsq, c0_embed], dim=1)  # (batch,2,hidden)
         curr_sample = c0_sample.clone()                     # (batch,1)
+        curr_pos = torch.arange(0, curr_h.size(1)).unsqueeze(0).to(tokens.device).long()
+        # --- remaining codebooks ---
         self.decoder.reset_caches()
         for i in range(1, self.config.audio_num_codebooks):
+            dec_mask = _index_causal_mask(self.decoder_causal_mask, curr_pos)
+            dec_h = self.decoder(self.projection(curr_h), input_pos=curr_pos, mask=dec_mask).to(dtype=dtype)
+            ci_logits = torch.mm(dec_h[:, -1, :], self.audio_head[i - 1])
             ci_sample = sample_topk_topp(ci_logits, topk, top_p, temperature)  # (batch,1)
             ci_embed = self._embed_audio(i, ci_sample.squeeze(-1)).unsqueeze(1)  # (batch,1,hidden)
         self.decoder.reset_caches()
     def _embed_audio(self, codebook: int, tokens: torch.Tensor) -> torch.Tensor:
         """Look up embeddings for audio token IDs belonging to one codebook.

         Every ID is shifted by ``codebook * self.config.audio_vocab_size``
         before the lookup, so each codebook addresses its own contiguous
         range of rows in ``self.audio_embeddings``.

         Args:
             codebook: Index of the codebook the tokens come from.
             tokens: Token IDs local to that codebook. Call sites in this
                 file pass a ``(batch,)`` tensor.

         Returns:
             Whatever ``self.audio_embeddings`` yields for the shifted IDs;
             call sites in this file treat it as ``(batch, hidden)``.
         """
         # Start of this codebook's slice in the shared embedding table.
         offset = self.config.audio_vocab_size * codebook
         return self.audio_embeddings(offset + tokens)
     def _embed_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
         text_ids = tokens[:, :, -1]
         text_emb = self.text_embeddings(text_ids).unsqueeze(-2)
         audio_ids = tokens[:, :, :-1] + (