dev-mode-orpheus-tts

Paused

App Files Community

Tomtom84 committed on Apr 21

Commit

a0cc672

verified ·

1 Parent(s): 5d55203

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -38

app.py CHANGED Viewed

@@ -35,28 +35,11 @@ class AudioMask(LogitsProcessor):
         self.sent_blocks = 0
         self.buffer_pos = 0 # Added buffer position
-    def __call__(self, input_ids, logits):
-        # Calculate allowed tokens based on buffer position
-        start_token = AUDIO_BASE + self.buffer_pos * 4096
-        end_token = start_token + 4096
-        allowed_audio = torch.arange(start_token, end_token, device=self.allow.device)
-        # Only allow NEW_BLOCK if buffer is full, otherwise only allow audio tokens
-        if self.buffer_pos == 7:
-            allowed = torch.cat([
-                torch.tensor([NEW_BLOCK], device=self.allow.device),
-                allowed_audio
-            ])
-        else:
-            allowed = allowed_audio # Only allow audio tokens
-        if self.sent_blocks:                        # ab 1. Block EOS zulassen
-            allowed = torch.cat([allowed, self.eos])
-        mask = logits.new_full(logits.shape, float("-inf"))
-        mask = logits.new_full(logits.shape, float("-inf"))
-        mask[:, allowed] = 0
-        return logits + mask
 # 3) FastAPI Grundgerüst ---------------------------------------------
 app = FastAPI()
@@ -94,7 +77,7 @@ def build_prompt(text: str, voice: str):
 def decode_block(block7: list[int]) -> bytes:
     l1,l2,l3=[],[],[]
-    l1.append(block7[0] - (0 * 4096)) # Subtract AUDIO_BASE + position 0 offset
     l2.append(block7[1] + (1 * 4096)) # Subtract AUDIO_BASE + position 1 offset
     l3 += [block7[2] + (2 * 4096), block7[3] + (3 * 4096)] # Subtract AUDIO_BASE + position offsets
     l2.append(block7[4] + (4 * 4096)) # Subtract AUDIO_BASE + position 4 offset
@@ -120,13 +103,11 @@ async def tts(ws: WebSocket):
         past       = None
         offset_len = ids.size(1)          # wie viele Tokens existieren schon
         last_tok   = None
-        buf        = []
         # masker.buffer_pos = 0 # Removed initialization here
         while True:
-            # Update buffer_pos based on current buffer length before generation
-            masker.buffer_pos = len(buf)
             # --- Mini‑Generate (Cache Disabled for Debugging) -------------------------------------------
             gen = model.generate(
                 input_ids      = ids, # Always use full sequence
@@ -164,17 +145,14 @@ async def tts(ws: WebSocket):
                     continue
                 # Only append if it's an audio token
                 # Only append if it's an audio token
-                if t >= AUDIO_BASE and t < AUDIO_BASE + AUDIO_SPAN:
-                    buf.append(t - AUDIO_BASE) # Append token relative to AUDIO_BASE
-                    # masker.buffer_pos += 1 # Removed increment here
-                    if len(buf) == 7:
-                        await ws.send_bytes(decode_block(buf))
-                        buf.clear()
-                        masker.sent_blocks = 1      # ab jetzt EOS zulässig
-                        # masker.buffer_pos = 0 # Removed reset here
-                else:
-                    # Optional: Log unexpected tokens
-                    print(f"DEBUG: Skipping non-audio token: {t}", flush=True)
     except (StopIteration, WebSocketDisconnect):
         pass

         self.sent_blocks = 0
         self.buffer_pos = 0 # Added buffer position
+    def __call__(self, input_ids, scores):
+        allow = torch.cat([self.allow, self.eos]) # Reverted masking logic
+        mask = torch.full_like(scores, float("-inf"))
+        mask[:, allow] = 0
+        return scores + mask
 # 3) FastAPI Grundgerüst ---------------------------------------------
 app = FastAPI()
 def decode_block(block7: list[int]) -> bytes:
     l1,l2,l3=[],[],[]
+    l1.append(block7[0] + (0 * 4096)) # Subtract AUDIO_BASE + position 0 offset
     l2.append(block7[1] + (1 * 4096)) # Subtract AUDIO_BASE + position 1 offset
     l3 += [block7[2] + (2 * 4096), block7[3] + (3 * 4096)] # Subtract AUDIO_BASE + position offsets
     l2.append(block7[4] + (4 * 4096)) # Subtract AUDIO_BASE + position 4 offset
         past       = None
         offset_len = ids.size(1)          # wie viele Tokens existieren schon
         last_tok   = None
+        buf         = []
         # masker.buffer_pos = 0 # Removed initialization here
+        # Removed buffer_pos update before generation
         while True:
             # --- Mini‑Generate (Cache Disabled for Debugging) -------------------------------------------
             gen = model.generate(
                 input_ids      = ids, # Always use full sequence
                     continue
                 # Only append if it's an audio token
                 # Only append if it's an audio token
+                buf.append(t - AUDIO_BASE) # Reverted to appending relative token
+                # masker.buffer_pos += 1 # Removed increment here
+                if len(buf) == 7:
+                    await ws.send_bytes(decode_block(buf))
+                    buf.clear()
+                    masker.sent_blocks = 1      # ab jetzt EOS zulässig
+                    # masker.buffer_pos = 0 # Removed reset here
+                # Removed else block for skipping non-audio tokens
     except (StopIteration, WebSocketDisconnect):
         pass