xlr8 committed on
Commit be19290 · Parent(s): da443d2

bugfix in top_p sampling when top_p tokens < top_k

Files changed (1)
  1. models.py +25 -14
models.py CHANGED
@@ -63,28 +63,39 @@ def _multinomial_sample_one_no_sync(probs):
     q = torch.empty_like(probs).exponential_(1)
     return torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int)
 
-def sample_topk_topp(logits: torch.Tensor, topk: int, top_p: float, temperature: float):
+def sample_topk_topp(logits, topk=50, topp=0.9, temperature=1.0):
+    if temperature <= 0:
+        raise ValueError("Temperature must be > 0")
+
     logits = logits / temperature
-    logits = torch.nn.functional.log_softmax(logits, dim=-1)
-    probs = torch.nn.functional.softmax(logits, dim=-1)
+    probs = torch.softmax(logits, dim=-1)
 
-    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
+    # Clamp topk to not exceed the vocab size
+    vocab_size = probs.shape[-1]
+    topk = min(topk, vocab_size)
+
+    # Get topk indices and probabilities
+    topk_probs, topk_indices = torch.topk(probs, topk, dim=-1)
+
+    # Compute cumulative probabilities for nucleus sampling
+    sorted_probs, sorted_indices = torch.sort(topk_probs, descending=True, dim=-1)
     cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
 
-    if top_p < 1.0:
-        sorted_mask = cumulative_probs > top_p
-        sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
-        sorted_mask[..., 0] = 0
-        probs[sorted_indices[sorted_mask]] = 0.0
-
-    if topk < probs.shape[-1]:
-        topk_thresh = torch.topk(probs, topk)[0][..., -1, None]
-        probs = torch.where(probs < topk_thresh, 0.0, probs)
-
-    probs = probs / probs.sum(dim=-1, keepdim=True)
-    return _multinomial_sample_one_no_sync(probs)
+    # Mask out tokens beyond topp threshold
+    topp_mask = cumulative_probs <= topp
+    # Always keep at least one token
+    topp_mask[..., 0] = True
+
+    # Apply mask and renormalize
+    masked_probs = sorted_probs * topp_mask
+    masked_probs = masked_probs / masked_probs.sum(dim=-1, keepdim=True)
+
+    # Sample from masked distribution
+    sample_idx = torch.multinomial(masked_probs, num_samples=1)
+
+    # Map back to the original vocab indices (sorted_indices points into the
+    # topk subset, so route the draw through topk_indices to recover vocab ids)
+    topk_pos = sorted_indices.gather(-1, sample_idx)
+    chosen_index = topk_indices.gather(-1, topk_pos).squeeze(-1)
+    return chosen_index
 
 @dataclass
 class ModelArgs:
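
For reference, a minimal sketch of how the rewritten sampler behaves in the case the commit message describes, where the top_p nucleus contains fewer tokens than topk. The import path, the seed, and the concrete logit values are illustrative assumptions, not part of the commit:

import torch

from models import sample_topk_topp  # assumed import path for the file changed above

# One token carries nearly all of the probability mass, so with topp=0.9 the
# nucleus is a single token -- far fewer than the topk=50 default.
logits = torch.full((1, 1000), -10.0)
logits[0, 42] = 10.0

torch.manual_seed(0)  # arbitrary seed for a reproducible draw
token = sample_topk_topp(logits, topk=50, topp=0.9, temperature=1.0)
print(token)  # expected: tensor([42]) -- a vocab id, not a position within the topk subset

Masking inside the topk subset and then mapping the sampled position back through topk_indices is what keeps the return value a vocabulary index even when top_p keeps fewer than topk tokens.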