dev-mode-realtts-orpheus

Paused

App Files Files Community

Tomtom84 commited on Apr 21

Commit

53012c3

verified ·

1 Parent(s): 55145d2

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -146

app.py CHANGED Viewed

@@ -36,6 +36,7 @@ NEW_BLOCK = 128257
 EOS_TOKEN = 128258
 AUDIO_BASE = 128266
 AUDIO_SPAN = 4096 * 7  # 28672 Codes
 # Create AUDIO_IDS on the correct device later in load_models
 AUDIO_IDS_CPU = torch.arange(AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN)
@@ -45,113 +46,117 @@ class AudioMask(LogitsProcessor):
         super().__init__()
         # Allow NEW_BLOCK and all valid audio tokens initially
         self.allow = torch.cat([
-            torch.tensor([new_block_token_id], device=audio_ids.device), # Add NEW_BLOCK token ID
             audio_ids
         ], dim=0)
-        self.eos = torch.tensor([eos_token_id], device=audio_ids.device) # Store EOS token ID as tensor
-        self.allow_with_eos = torch.cat([self.allow, self.eos], dim=0) # Precompute combined tensor
         self.sent_blocks = 0 # State: Number of audio blocks sent
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        # Determine which tokens are allowed based on whether blocks have been sent
         current_allow = self.allow_with_eos if self.sent_blocks > 0 else self.allow
-        # Create a mask initialized to negative infinity
         mask = torch.full_like(scores, float("-inf"))
-        # Set allowed token scores to 0 (effectively allowing them)
         mask[:, current_allow] = 0
-        # Apply the mask to the scores
         return scores + mask
     def reset(self):
-        """Resets the state for a new generation request."""
         self.sent_blocks = 0
 # 3) StoppingCriteria für EOS ---------------------------------------
-# generate() needs explicit stopping criteria when using a streamer
 class EosStoppingCriteria(StoppingCriteria):
     def __init__(self, eos_token_id: int):
         self.eos_token_id = eos_token_id
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        # Check if the *last* generated token is the EOS token
         if input_ids.shape[1] > 0 and input_ids[:, -1] == self.eos_token_id:
-            # print("StoppingCriteria: EOS detected.")
             return True
         return False
 # 4) Benutzerdefinierter AudioStreamer -------------------------------
 class AudioStreamer(BaseStreamer):
-    # --- Updated __init__ to accept target_device ---
     def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str):
         self.ws = ws
         self.snac = snac_decoder
-        self.masker = audio_mask # Reference to the mask to update sent_blocks
-        self.loop = loop # Event loop of the main thread for run_coroutine_threadsafe
-        # --- Use the passed target_device ---
         self.device = target_device
-        self.buf: list[int] = [] # Buffer for audio token values (AUDIO_BASE subtracted)
-        self.tasks = set() # Keep track of pending send tasks
     def _decode_block(self, block7: list[int]) -> bytes:
         """
         Decodes a block of 7 audio token values (AUDIO_BASE subtracted) into audio bytes.
-        NOTE: The mapping from the 7 tokens to the 3 SNAC codebooks (l1, l2, l3)
-              is based on the structure found in the previous while-loop version.
-              If audio is distorted, this mapping is the primary suspect.
-              Ensure this mapping is correct for the specific model!
         """
         if len(block7) != 7:
             print(f"Streamer Warning: _decode_block received {len(block7)} tokens, expected 7. Skipping.")
-            return b"" # Return empty bytes if block is incomplete
-        # --- Mapping derived from previous user version (indices [0], [1,4], [2,3,5,6]) ---
-        # This seems more likely to be correct for Kartoffel_Orpheus if the previous version worked.
         try:
-            l1 = [block7[0]]                         # Index 0
-            l2 = [block7[1], block7[4]]             # Indices 1, 4
-            l3 = [block7[2], block7[3], block7[5], block7[6]] # Indices 2, 3, 5, 6
         except IndexError:
             print(f"Streamer Error: Index out of bounds during token mapping. Block: {block7}")
             return b""
-        # --- Alternative Hypothesis (commented out): Interleaving mapping ---
-        # try:
-        #     l1 = [block7[0], block7[3], block7[6]] # Codebook 1 indices: 0, 3, 6
-        #     l2 = [block7[1], block7[4]]          # Codebook 2 indices: 1, 4
-        #     l3 = [block7[2], block7[5]]          # Codebook 3 indices: 2, 5
-        # except IndexError:
-        #     print(f"Streamer Error: Index out of bounds during token mapping. Block: {block7}")
-        #     return b""
-        # --- End Alternative Hypothesis ---
-        # Convert lists to tensors on the correct device
-        # Use self.device which was set correctly in __init__
-        codes_l1 = torch.tensor(l1, dtype=torch.long, device=self.device).unsqueeze(0)
-        codes_l2 = torch.tensor(l2, dtype=torch.long, device=self.device).unsqueeze(0)
-        codes_l3 = torch.tensor(l3, dtype=torch.long, device=self.device).unsqueeze(0)
-        codes = [codes_l1, codes_l2, codes_l3] # List of tensors for SNAC
-        # Decode using SNAC
-        with torch.no_grad():
-            # self.snac should already be on self.device from load_models_startup
-            audio = self.snac.decode(codes)[0] # Decode expects list of tensors, result might have batch dim
-        # Squeeze, move to CPU, convert to numpy
-        audio_np = audio.squeeze().detach().cpu().numpy()
-        # Convert to 16-bit PCM bytes
-        audio_bytes = (audio_np * 32767).astype("int16").tobytes()
-        return audio_bytes
     async def _send_audio_bytes(self, data: bytes):
         """Coroutine to send bytes over WebSocket."""
-        if not data: # Don't send empty bytes
             return
         try:
             await self.ws.send_bytes(data)
-            # print(f"Streamer: Sent {len(data)} audio bytes.")
         except WebSocketDisconnect:
             print("Streamer: WebSocket disconnected during send.")
         except Exception as e:
@@ -159,67 +164,43 @@ class AudioStreamer(BaseStreamer):
     def put(self, value: torch.LongTensor):
         """
-        Receives new token IDs (Tensor) from generate() (runs in worker thread).
-        Processes tokens, decodes full blocks, and schedules sending via run_coroutine_threadsafe.
         """
-        # Ensure value is on CPU and flatten to a list of ints
         if value.numel() == 0:
             return
         new_token_ids = value.squeeze().tolist()
-        if isinstance(new_token_ids, int): # Handle single token case
             new_token_ids = [new_token_ids]
         for t in new_token_ids:
             if t == EOS_TOKEN:
-                # print("Streamer: EOS token encountered.")
-                # EOS is handled by StoppingCriteria, no action needed here except maybe logging.
-                break # Stop processing this batch if EOS is found
             if t == NEW_BLOCK:
-                # print("Streamer: NEW_BLOCK token encountered.")
-                # NEW_BLOCK indicates the start of audio, might reset buffer if needed
                 self.buf.clear()
-                continue # Move to the next token
-            # Check if token is within the expected audio range
             if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
-                # Store value relative to base (IMPORTANT for _decode_block)
-                self.buf.append(t - AUDIO_BASE)
-            else:
-                # Log unexpected tokens (like START_TOKEN or others if generation goes wrong)
                 # print(f"Streamer Warning: Ignoring unexpected token {t}")
-                pass # Ignore tokens outside the audio range
-            # If buffer has 7 tokens, decode and send
             if len(self.buf) == 7:
                 audio_bytes = self._decode_block(self.buf)
-                self.buf.clear() # Clear buffer after processing
-                if audio_bytes: # Only send if decoding was successful
-                    # Schedule the async send function to run on the main event loop
                     future = asyncio.run_coroutine_threadsafe(self._send_audio_bytes(audio_bytes), self.loop)
                     self.tasks.add(future)
-                    # Optional: Remove completed tasks to prevent memory leak if generation is very long
                     future.add_done_callback(self.tasks.discard)
-                    # Allow EOS only after the first full block has been processed and scheduled for sending
                     if self.masker.sent_blocks == 0:
-                        # print("Streamer: First audio block processed, allowing EOS.")
-                        self.masker.sent_blocks = 1 # Update state in the mask
-        # Note: No need to explicitly wait for tasks here. put() should return quickly.
     def end(self):
         """Called by generate() when generation finishes."""
-        # Handle any remaining tokens in the buffer (optional, here we discard them)
         if len(self.buf) > 0:
             print(f"Streamer: End of generation with incomplete block ({len(self.buf)} tokens). Discarding.")
             self.buf.clear()
-        # Optional: Wait briefly for any outstanding send tasks to complete?
-        # This is tricky because end() is sync. A robust solution might involve
-        # signaling the WebSocket handler to wait before closing.
-        # For simplicity, we rely on FastAPI/Uvicorn's graceful shutdown handling.
         # print(f"Streamer: Generation finished. Pending send tasks: {len(self.tasks)}")
         pass
@@ -227,7 +208,7 @@ class AudioStreamer(BaseStreamer):
 app = FastAPI()
 @app.on_event("startup")
-async def load_models_startup(): # Make startup async if needed for future async loads
     global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU
     print(f"🚀 Starting up on device: {device}")
@@ -236,41 +217,32 @@ async def load_models_startup(): # Make startup async if needed for future async
     tok = AutoTokenizer.from_pretrained(REPO)
     print("Tokenizer loaded.")
-    # Load SNAC first (usually smaller)
     snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)
-    # --- FIXED Print statement ---
     print(f"SNAC loaded to {device}.") # Use the global device variable
-    # Load the main model
-    # Determine appropriate dtype based on device and support
-    model_dtype = torch.float32 # Default to float32 for CPU
     if device == "cuda":
         if torch.cuda.is_bf16_supported():
             model_dtype = torch.bfloat16
             print("Using bfloat16 for model.")
         else:
-            model_dtype = torch.float16 # Fallback to float16 if bfloat16 not supported
             print("Using float16 for model.")
     model = AutoModelForCausalLM.from_pretrained(
         REPO,
-        device_map={"": 0} if device == "cuda" else None, # Assign to GPU 0 if cuda
         torch_dtype=model_dtype,
-        low_cpu_mem_usage=True, # Good practice for large models
     )
-    model.config.pad_token_id = model.config.eos_token_id # Set pad token
     print(f"Model loaded to {model.device} with dtype {model.dtype}.")
-    # Ensure model is in evaluation mode
     model.eval()
-    # Initialize AudioMask (needs AUDIO_IDS on the correct device)
     audio_ids_device = AUDIO_IDS_CPU.to(device)
     masker = AudioMask(audio_ids_device, NEW_BLOCK, EOS_TOKEN)
     print("AudioMask initialized.")
-    # Initialize StoppingCriteria
-    # IMPORTANT: Create the list and add the criteria instance
     stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(EOS_TOKEN)])
     print("StoppingCriteria initialized.")
@@ -283,18 +255,15 @@ def hello():
 # 6) Helper zum Prompt Bauen -------------------------------------------
 def build_prompt(text: str, voice: str) -> tuple[torch.Tensor, torch.Tensor]:
     """Builds the input_ids and attention_mask for the model."""
-    # Format: <START> <VOICE>: <TEXT> <NEW_BLOCK>
     prompt_text = f"{voice}: {text}"
     prompt_ids = tok(prompt_text, return_tensors="pt").input_ids.to(device)
-    # Construct input_ids tensor
     input_ids = torch.cat([
-        torch.tensor([[START_TOKEN]], device=device, dtype=torch.long), # Start token
-        prompt_ids,                                  # Encoded prompt
-        torch.tensor([[NEW_BLOCK]], device=device, dtype=torch.long)   # New block token to trigger audio
     ], dim=1)
-    # Create attention mask (all ones)
     attention_mask = torch.ones_like(input_ids)
     return input_ids, attention_mask
@@ -303,49 +272,37 @@ def build_prompt(text: str, voice: str) -> tuple[torch.Tensor, torch.Tensor]:
 async def tts(ws: WebSocket):
     await ws.accept()
     print("🔌 Client connected")
-    streamer = None # Initialize for finally block
-    main_loop = asyncio.get_running_loop() # Get the current event loop
     try:
-        # Receive configuration
         req_text = await ws.receive_text()
         print(f"Received request: {req_text}")
         req = json.loads(req_text)
-        text = req.get("text", "Hallo Welt, wie geht es dir heute?") # Default text
-        voice = req.get("voice", "Jakob") # Default voice
         if not text:
             print("⚠️ Request text is empty.")
-            await ws.close(code=1003, reason="Text cannot be empty") # 1003 = Cannot accept data type
             return
         print(f"Generating audio for: '{text}' with voice '{voice}'")
-        # Prepare prompt
         ids, attn = build_prompt(text, voice)
-        # --- Reset stateful components ---
-        masker.reset() # CRITICAL: Reset the mask state for the new request
-        # --- Create Streamer Instance ---
-        # --- Pass the global 'device' variable ---
         streamer = AudioStreamer(ws, snac, masker, main_loop, device)
-        # --- Run model.generate in a separate thread ---
-        # This prevents blocking the main FastAPI event loop
         print("Starting generation in background thread...")
         await asyncio.to_thread(
             model.generate,
             input_ids=ids,
             attention_mask=attn,
-            max_new_tokens=1500, # Limit generation length (adjust as needed)
             logits_processor=[masker],
             stopping_criteria=stopping_criteria,
-            do_sample=False, # Use greedy decoding for potentially more stable audio
-            # do_sample=True, temperature=0.7, top_p=0.95, # Or use sampling
             use_cache=True,
-            streamer=streamer # Pass the custom streamer
-            # No need to manage past_key_values manually
         )
         print("Generation thread finished.")
@@ -358,32 +315,26 @@ async def tts(ws: WebSocket):
     except Exception as e:
         error_details = traceback.format_exc()
         print(f"❌ WS‑Error: {e}\n{error_details}", flush=True)
-        # Try to send an error message before closing, if possible
         error_payload = json.dumps({"error": str(e)})
         try:
             if ws.client_state.name == "CONNECTED":
-                 await ws.send_text(error_payload) # Send error as text/json
         except Exception:
-            pass # Ignore error during error reporting
-        # Close with internal server error code
         if ws.client_state.name == "CONNECTED":
-            await ws.close(code=1011) # 1011 = Internal Server Error
     finally:
-        # Ensure streamer's end method is called if it exists
         if streamer:
             try:
-                # print("Calling streamer.end()")
                 streamer.end()
             except Exception as e_end:
                  print(f"Error during streamer.end(): {e_end}")
-        # Ensure WebSocket is closed
         print("Closing connection.")
         if ws.client_state.name == "CONNECTED":
             try:
-                await ws.close(code=1000) # 1000 = Normal Closure
             except RuntimeError as e_close:
-                 # Can happen if connection is already closing/closed
                  print(f"Runtime error closing websocket: {e_close}")
             except Exception as e_close_final:
                  print(f"Error closing websocket: {e_close_final}")
@@ -395,6 +346,4 @@ async def tts(ws: WebSocket):
 if __name__ == "__main__":
     import uvicorn
     print("Starting Uvicorn server...")
-    # Use reload=True only for development, remove for production
-    # Consider adding --workers 1 if you experience issues with multiple workers and global state/GPU memory
-    uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info") #, reload=True)

 EOS_TOKEN = 128258
 AUDIO_BASE = 128266
 AUDIO_SPAN = 4096 * 7  # 28672 Codes
+CODEBOOK_SIZE = 4096  # Explicitly define the codebook size
 # Create AUDIO_IDS on the correct device later in load_models
 AUDIO_IDS_CPU = torch.arange(AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN)
         super().__init__()
         # Allow NEW_BLOCK and all valid audio tokens initially
         self.allow = torch.cat([
+            torch.tensor([new_block_token_id], device=audio_ids.device, dtype=torch.long),
             audio_ids
         ], dim=0)
+        self.eos = torch.tensor([eos_token_id], device=audio_ids.device, dtype=torch.long)
+        self.allow_with_eos = torch.cat([self.allow, self.eos], dim=0)
         self.sent_blocks = 0 # State: Number of audio blocks sent
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         current_allow = self.allow_with_eos if self.sent_blocks > 0 else self.allow
         mask = torch.full_like(scores, float("-inf"))
         mask[:, current_allow] = 0
         return scores + mask
     def reset(self):
         self.sent_blocks = 0
 # 3) StoppingCriteria für EOS ---------------------------------------
 class EosStoppingCriteria(StoppingCriteria):
     def __init__(self, eos_token_id: int):
         self.eos_token_id = eos_token_id
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
         if input_ids.shape[1] > 0 and input_ids[:, -1] == self.eos_token_id:
             return True
         return False
 # 4) Benutzerdefinierter AudioStreamer -------------------------------
 class AudioStreamer(BaseStreamer):
     def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str):
         self.ws = ws
         self.snac = snac_decoder
+        self.masker = audio_mask
+        self.loop = loop
         self.device = target_device
+        self.buf: list[int] = []
+        self.tasks = set()
     def _decode_block(self, block7: list[int]) -> bytes:
         """
         Decodes a block of 7 audio token values (AUDIO_BASE subtracted) into audio bytes.
+        NOTE: Extracts base code value (0-4095) using modulo, assuming
+              input values represent (slot_offset + code_value).
+              Maps extracted values using the structure potentially correct for Kartoffel_Orpheus.
         """
         if len(block7) != 7:
             print(f"Streamer Warning: _decode_block received {len(block7)} tokens, expected 7. Skipping.")
+            return b""
         try:
+            # --- Extract base code value (0 to CODEBOOK_SIZE-1) for each slot using modulo ---
+            code_val_0 = block7[0] % CODEBOOK_SIZE
+            code_val_1 = block7[1] % CODEBOOK_SIZE
+            code_val_2 = block7[2] % CODEBOOK_SIZE
+            code_val_3 = block7[3] % CODEBOOK_SIZE
+            code_val_4 = block7[4] % CODEBOOK_SIZE
+            code_val_5 = block7[5] % CODEBOOK_SIZE
+            code_val_6 = block7[6] % CODEBOOK_SIZE
+            # --- Map the extracted code values to the SNAC codebooks (l1, l2, l3) ---
+            # Using the structure from the user's previous version, believed to be correct
+            l1 = [code_val_0]
+            l2 = [code_val_1, code_val_4]
+            l3 = [code_val_2, code_val_3, code_val_5, code_val_6]
         except IndexError:
             print(f"Streamer Error: Index out of bounds during token mapping. Block: {block7}")
             return b""
+        except Exception as e_map: # Catch potential issues with modulo/mapping
+            print(f"Streamer Error: Exception during code value extraction/mapping: {e_map}. Block: {block7}")
+            return b""
+        # --- Convert lists to tensors on the correct device ---
+        try:
+            codes_l1 = torch.tensor(l1, dtype=torch.long, device=self.device).unsqueeze(0)
+            codes_l2 = torch.tensor(l2, dtype=torch.long, device=self.device).unsqueeze(0)
+            codes_l3 = torch.tensor(l3, dtype=torch.long, device=self.device).unsqueeze(0)
+            codes = [codes_l1, codes_l2, codes_l3]
+        except Exception as e_tensor:
+            print(f"Streamer Error: Exception during tensor conversion: {e_tensor}. l1={l1}, l2={l2}, l3={l3}")
+            return b""
+        # --- Decode using SNAC ---
+        try:
+            with torch.no_grad():
+                # self.snac should already be on self.device from load_models_startup
+                audio = self.snac.decode(codes)[0] # Decode expects list of tensors, result might have batch dim
+        except Exception as e_decode:
+            # Add more detailed logging here if it fails again
+            print(f"Streamer Error: Exception during snac.decode: {e_decode}")
+            print(f"Input codes shapes: {[c.shape for c in codes]}")
+            print(f"Input codes dtypes: {[c.dtype for c in codes]}")
+            print(f"Input codes devices: {[c.device for c in codes]}")
+            # Avoid printing potentially huge lists, maybe just check min/max?
+            print(f"Input code values (min/max): L1({min(l1)}/{max(l1)}) L2({min(l2)}/{max(l2)}) L3({min(l3)}/{max(l3)})")
+            return b""
+        # --- Post-processing ---
+        try:
+            audio_np = audio.squeeze().detach().cpu().numpy()
+            audio_bytes = (audio_np * 32767).astype("int16").tobytes()
+            return audio_bytes
+        except Exception as e_post:
+            print(f"Streamer Error: Exception during post-processing: {e_post}. Audio tensor shape: {audio.shape}")
+            return b""
     async def _send_audio_bytes(self, data: bytes):
         """Coroutine to send bytes over WebSocket."""
+        if not data:
             return
         try:
             await self.ws.send_bytes(data)
         except WebSocketDisconnect:
             print("Streamer: WebSocket disconnected during send.")
         except Exception as e:
     def put(self, value: torch.LongTensor):
         """
+        Receives new token IDs (Tensor) from generate().
+        Processes tokens, decodes full blocks, and schedules sending.
         """
         if value.numel() == 0:
             return
         new_token_ids = value.squeeze().tolist()
+        if isinstance(new_token_ids, int):
             new_token_ids = [new_token_ids]
         for t in new_token_ids:
             if t == EOS_TOKEN:
+                break
             if t == NEW_BLOCK:
                 self.buf.clear()
+                continue
             if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
+                self.buf.append(t - AUDIO_BASE) # Store value relative to base
+            # else: # Optionally log ignored tokens
                 # print(f"Streamer Warning: Ignoring unexpected token {t}")
             if len(self.buf) == 7:
                 audio_bytes = self._decode_block(self.buf)
+                self.buf.clear()
+                if audio_bytes:
                     future = asyncio.run_coroutine_threadsafe(self._send_audio_bytes(audio_bytes), self.loop)
                     self.tasks.add(future)
                     future.add_done_callback(self.tasks.discard)
                     if self.masker.sent_blocks == 0:
+                        self.masker.sent_blocks = 1
     def end(self):
         """Called by generate() when generation finishes."""
         if len(self.buf) > 0:
             print(f"Streamer: End of generation with incomplete block ({len(self.buf)} tokens). Discarding.")
             self.buf.clear()
         # print(f"Streamer: Generation finished. Pending send tasks: {len(self.tasks)}")
         pass
 app = FastAPI()
 @app.on_event("startup")
+async def load_models_startup():
     global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU
     print(f"🚀 Starting up on device: {device}")
     tok = AutoTokenizer.from_pretrained(REPO)
     print("Tokenizer loaded.")
     snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)
     print(f"SNAC loaded to {device}.") # Use the global device variable
+    model_dtype = torch.float32
     if device == "cuda":
         if torch.cuda.is_bf16_supported():
             model_dtype = torch.bfloat16
             print("Using bfloat16 for model.")
         else:
+            model_dtype = torch.float16
             print("Using float16 for model.")
     model = AutoModelForCausalLM.from_pretrained(
         REPO,
+        device_map={"": 0} if device == "cuda" else None,
         torch_dtype=model_dtype,
+        low_cpu_mem_usage=True,
     )
+    model.config.pad_token_id = model.config.eos_token_id
     print(f"Model loaded to {model.device} with dtype {model.dtype}.")
     model.eval()
     audio_ids_device = AUDIO_IDS_CPU.to(device)
     masker = AudioMask(audio_ids_device, NEW_BLOCK, EOS_TOKEN)
     print("AudioMask initialized.")
     stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(EOS_TOKEN)])
     print("StoppingCriteria initialized.")
 # 6) Helper zum Prompt Bauen -------------------------------------------
 def build_prompt(text: str, voice: str) -> tuple[torch.Tensor, torch.Tensor]:
     """Builds the input_ids and attention_mask for the model."""
     prompt_text = f"{voice}: {text}"
     prompt_ids = tok(prompt_text, return_tensors="pt").input_ids.to(device)
     input_ids = torch.cat([
+        torch.tensor([[START_TOKEN]], device=device, dtype=torch.long),
+        prompt_ids,
+        torch.tensor([[NEW_BLOCK]], device=device, dtype=torch.long)
     ], dim=1)
     attention_mask = torch.ones_like(input_ids)
     return input_ids, attention_mask
 async def tts(ws: WebSocket):
     await ws.accept()
     print("🔌 Client connected")
+    streamer = None
+    main_loop = asyncio.get_running_loop()
     try:
         req_text = await ws.receive_text()
         print(f"Received request: {req_text}")
         req = json.loads(req_text)
+        text = req.get("text", "Hallo Welt, wie geht es dir heute?")
+        voice = req.get("voice", "Jakob")
         if not text:
             print("⚠️ Request text is empty.")
+            await ws.close(code=1003, reason="Text cannot be empty")
             return
         print(f"Generating audio for: '{text}' with voice '{voice}'")
         ids, attn = build_prompt(text, voice)
+        masker.reset()
         streamer = AudioStreamer(ws, snac, masker, main_loop, device)
         print("Starting generation in background thread...")
         await asyncio.to_thread(
             model.generate,
             input_ids=ids,
             attention_mask=attn,
+            max_new_tokens=1500,
             logits_processor=[masker],
             stopping_criteria=stopping_criteria,
+            do_sample=False, # Using greedy decoding
             use_cache=True,
+            streamer=streamer
         )
         print("Generation thread finished.")
     except Exception as e:
         error_details = traceback.format_exc()
         print(f"❌ WS‑Error: {e}\n{error_details}", flush=True)
         error_payload = json.dumps({"error": str(e)})
         try:
             if ws.client_state.name == "CONNECTED":
+                 await ws.send_text(error_payload)
         except Exception:
+            pass
         if ws.client_state.name == "CONNECTED":
+            await ws.close(code=1011)
     finally:
         if streamer:
             try:
                 streamer.end()
             except Exception as e_end:
                  print(f"Error during streamer.end(): {e_end}")
         print("Closing connection.")
         if ws.client_state.name == "CONNECTED":
             try:
+                await ws.close(code=1000)
             except RuntimeError as e_close:
                  print(f"Runtime error closing websocket: {e_close}")
             except Exception as e_close_final:
                  print(f"Error closing websocket: {e_close_final}")
 if __name__ == "__main__":
     import uvicorn
     print("Starting Uvicorn server...")
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")