dev-mode-orpheus-tts

Paused

App Files Files Community

Tomtom84 committed on Jun 8

Commit

4715aa2

verified ·

1 Parent(s): be98391

Update app.py

Browse files

Files changed (1) hide show

app.py +381 -378

app.py CHANGED Viewed

@@ -1,398 +1,401 @@
-# app.py ──────────────────────────────────────────────────────────────
 import os
-import json
-import torch
-import asyncio
-import traceback # Import traceback for better error logging
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect
-from huggingface_hub import login
-from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, StoppingCriteria, StoppingCriteriaList
-# Import BaseStreamer for the interface
-from transformers.generation.streamers import BaseStreamer
-from snac import SNAC # Ensure you have 'pip install snac'
-# --- Globals (assigned by load_models_startup when the app starts) ---
-tok = None  # tokenizer loaded from REPO
-model = None  # causal language model loaded from REPO
-snac = None  # SNAC audio decoder
-masker = None  # AudioMask logits processor
-stopping_criteria = None  # StoppingCriteriaList holding EosStoppingCriteria
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# 0) Login + Device ---------------------------------------------------
-# Log in to the Hugging Face Hub only when an HF_TOKEN environment variable is set.
-HF_TOKEN = os.getenv("HF_TOKEN")
-if HF_TOKEN:
-    print("🔑 Logging in to Hugging Face Hub...")
-    login(HF_TOKEN)
-# torch.backends.cuda.enable_flash_sdp(False) # Uncomment if needed for the PyTorch 2.2 bug
-# 1) Constants --------------------------------------------------------
-REPO = "SebastianBodza/Kartoffel_Orpheus-3B_german_natural-v0.1"
-START_TOKEN = 128259
-NEW_BLOCK = 128257
-EOS_TOKEN = 128258 # Ensure this is correct for the model; may be overwritten at startup
-AUDIO_BASE = 128266
-AUDIO_SPAN = 4096 * 7  # 28672 codes
-CODEBOOK_SIZE = 4096  # Explicitly define the codebook size
-# All token IDs in [AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN), kept on the CPU until startup.
-AUDIO_IDS_CPU = torch.arange(AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN)
-# 2) Logit‑Mask -------------------------------------------------------
-class AudioMask(LogitsProcessor):
-    def __init__(self, audio_ids: torch.Tensor, new_block_token_id: int, eos_token_id: int):
-        super().__init__()
-        self.allow = torch.cat([
-            torch.tensor([new_block_token_id], device=audio_ids.device, dtype=torch.long),
-            audio_ids
-        ], dim=0)
-        self.eos = torch.tensor([eos_token_id], device=audio_ids.device, dtype=torch.long)
-        self.allow_with_eos = torch.cat([self.allow, self.eos], dim=0)
-        self.sent_blocks = 0 # State: Number of audio blocks sent
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        current_allow = self.allow_with_eos if self.sent_blocks > 0 else self.allow
-        mask = torch.full_like(scores, float("-inf"))
-        mask[:, current_allow] = 0
-        return scores + mask
-    def reset(self):
-        self.sent_blocks = 0
-# 3) StoppingCriteria für EOS ---------------------------------------
-class EosStoppingCriteria(StoppingCriteria):
-    def __init__(self, eos_token_id: int):
-        self.eos_token_id = eos_token_id
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        if input_ids.shape[1] > 0 and input_ids[:, -1] == self.eos_token_id:
-            # print("StoppingCriteria: EOS detected.") # Optional: Uncomment for debugging
-            return True
-        return False
-# 4) Custom AudioStreamer ---------------------------------------------
-class AudioStreamer(BaseStreamer):
-    """Streamer that turns generated audio tokens into PCM bytes on a WebSocket.
-    ``put`` collects audio token IDs into blocks of seven, decodes each full
-    block with SNAC and schedules the resulting bytes for sending on ``loop``.
-    """
-    def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str):
-        self.ws = ws
-        self.snac = snac_decoder
-        self.masker = audio_mask
-        self.loop = loop
-        self.device = target_device
-        # Audio token values (AUDIO_BASE already subtracted) of the block in progress.
-        self.buf: list[int] = []
-        # Futures of sends that were scheduled and have not completed yet.
-        self.tasks = set()
-    def _decode_block(self, block7: list[int]) -> bytes:
-        """
-        Decodes a block of 7 audio token values (AUDIO_BASE subtracted) into audio bytes.
-        Uses modulo to extract base code value (0-4095).
-        Maps extracted values using the structure potentially correct for Kartoffel_Orpheus.
-        Returns b"" (after printing a message) whenever any step fails.
-        """
-        if len(block7) != 7:
-            print(f"Streamer Warning: _decode_block received {len(block7)} tokens, expected 7. Skipping.")
-            return b""
         try:
-            # --- Extract base code value (0 to CODEBOOK_SIZE-1) for each slot using modulo ---
-            code_val_0 = block7[0] % CODEBOOK_SIZE
-            code_val_1 = block7[1] % CODEBOOK_SIZE
-            code_val_2 = block7[2] % CODEBOOK_SIZE
-            code_val_3 = block7[3] % CODEBOOK_SIZE
-            code_val_4 = block7[4] % CODEBOOK_SIZE
-            code_val_5 = block7[5] % CODEBOOK_SIZE
-            code_val_6 = block7[6] % CODEBOOK_SIZE
-            # --- Map the extracted code values to the SNAC codebooks (l1, l2, l3) ---
-            # Slot 0 -> l1, slots 1 and 4 -> l2, slots 2, 3, 5 and 6 -> l3.
-            l1 = [code_val_0]
-            l2 = [code_val_1, code_val_4]
-            l3 = [code_val_2, code_val_3, code_val_5, code_val_6]
-        except IndexError:
-            print(f"Streamer Error: Index out of bounds during token mapping. Block: {block7}")
-            return b""
-        except Exception as e_map:
-            print(f"Streamer Error: Exception during code value extraction/mapping: {e_map}. Block: {block7}")
-            return b""
-        # --- Convert lists to tensors on the correct device ---
-        # unsqueeze(0) adds a leading dimension of size one to each code tensor.
-        try:
-            codes_l1 = torch.tensor(l1, dtype=torch.long, device=self.device).unsqueeze(0)
-            codes_l2 = torch.tensor(l2, dtype=torch.long, device=self.device).unsqueeze(0)
-            codes_l3 = torch.tensor(l3, dtype=torch.long, device=self.device).unsqueeze(0)
-            codes = [codes_l1, codes_l2, codes_l3]
-        except Exception as e_tensor:
-            print(f"Streamer Error: Exception during tensor conversion: {e_tensor}. l1={l1}, l2={l2}, l3={l3}")
-            return b""
-        # --- Decode using SNAC ---
-        try:
-            with torch.no_grad():
-                audio = self.snac.decode(codes)[0]
-        except Exception as e_decode:
-            print(f"Streamer Error: Exception during snac.decode: {e_decode}")
-            print(f"Input codes shapes: {[c.shape for c in codes]}")
-            print(f"Input codes dtypes: {[c.dtype for c in codes]}")
-            print(f"Input codes devices: {[c.device for c in codes]}")
-            print(f"Input code values (min/max): L1({min(l1)}/{max(l1)}) L2({min(l2)}/{max(l2)}) L3({min(l3)}/{max(l3)})")
-            return b""
-        # --- Post-processing ---
-        # Scale by 32767 and cast to int16 before serialising to raw bytes.
-        try:
-            audio_np = audio.squeeze().detach().cpu().numpy()
-            audio_bytes = (audio_np * 32767).astype("int16").tobytes()
-            return audio_bytes
-        except Exception as e_post:
-            print(f"Streamer Error: Exception during post-processing: {e_post}. Audio tensor shape: {audio.shape}")
-            return b""
-    async def _send_audio_bytes(self, data: bytes):
-        """Coroutine to send bytes over WebSocket; send errors are logged, never raised."""
-        if not data:
-            return
-        try:
-            await self.ws.send_bytes(data)
-        except WebSocketDisconnect:
-            print("Streamer: WebSocket disconnected during send.")
         except Exception as e:
-            # Log errors other than the expected send-after-close error
-            if "Cannot call \"send\" once a close message has been sent" not in str(e):
-                 print(f"Streamer: Error sending bytes: {e}")
-            # else: # Optionally print disconnect errors quietly
-            #    print("Streamer: Attempted send after close.")
-            pass # Avoid flooding logs if client disconnects early
-    def put(self, value: torch.LongTensor):
-        """
-        Receives new token IDs (Tensor) from generate().
-        Processes tokens, decodes full blocks, and schedules sending.
-        """
-        if value.numel() == 0:
-            return
-        # Ensure value is on CPU and flatten to a list of ints
-        new_token_ids = value.squeeze().cpu().tolist() # Move to CPU before list conversion
-        # tolist() yields a bare int when the squeezed tensor has no dimensions.
-        if isinstance(new_token_ids, int):
-            new_token_ids = [new_token_ids]
-        for t in new_token_ids:
-            # --- DEBUGGING PRINT ---
-            # Log every token ID received from the model
-            print(f"Streamer received token ID: {t}")
-            # --- END DEBUGGING ---
-            if t == EOS_TOKEN:
-                # print("Streamer: EOS token encountered.") # Optional debugging
-                break # Stop processing this batch if EOS is found
-            if t == NEW_BLOCK:
-                # print("Streamer: NEW_BLOCK token encountered.") # Optional debugging
-                self.buf.clear()
-                continue # Move to the next token
-            # Check if token is within the expected audio range; other tokens are ignored
-            if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
-                self.buf.append(t - AUDIO_BASE) # Store value relative to base
-            # else: # Log unexpected tokens if needed
-                # print(f"Streamer Warning: Ignoring unexpected token {t} (outside audio range [{AUDIO_BASE}, {AUDIO_BASE + AUDIO_SPAN}))")
-                pass
-            # If buffer has 7 tokens, decode and send
-            if len(self.buf) == 7:
-                audio_bytes = self._decode_block(self.buf)
-                self.buf.clear() # Clear buffer after processing
-                if audio_bytes: # Only send if decoding was successful
-                    # Schedule the async send function to run on the event loop given at construction
-                    future = asyncio.run_coroutine_threadsafe(self._send_audio_bytes(audio_bytes), self.loop)
-                    self.tasks.add(future)
-                    # Remove completed tasks so the set does not grow during long generations
-                    future.add_done_callback(self.tasks.discard)
-                    # Allow EOS only after the first full block has been processed and scheduled for sending
-                    if self.masker.sent_blocks == 0:
-                        # print("Streamer: First audio block processed, allowing EOS.")
-                        self.masker.sent_blocks = 1 # Update state in the mask
-    def end(self):
-        """Called by generate() when generation finishes; drops any incomplete block."""
-        if len(self.buf) > 0:
-            print(f"Streamer: End of generation with incomplete block ({len(self.buf)} tokens). Discarding.")
-            self.buf.clear()
-        # print(f"Streamer: Generation finished.") # Optional debugging
-        pass
-# 5) FastAPI App ------------------------------------------------------
-app = FastAPI()
-@app.on_event("startup")
-async def load_models_startup():
-    """Load tokenizer, SNAC decoder and language model, then build the generation helpers.
-    Runs once at application startup and fills the module-level globals
-    ``tok``, ``snac``, ``model``, ``masker`` and ``stopping_criteria``.
-    ``EOS_TOKEN`` is replaced by the model config's EOS ID when that differs.
-    """
-    global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU, EOS_TOKEN
-    print(f"🚀 Starting up on device: {device}")
-    print("⏳ Lade Modelle …", flush=True)
-    tok = AutoTokenizer.from_pretrained(REPO)
-    print("Tokenizer loaded.")
-    snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)
-    print(f"SNAC loaded to {device}.")
-    # float32 on CPU; on CUDA prefer bfloat16 and fall back to float16.
-    model_dtype = torch.float32
-    if device == "cuda":
-        if torch.cuda.is_bf16_supported():
-            model_dtype = torch.bfloat16
-            print("Using bfloat16 for model.")
-        else:
-            model_dtype = torch.float16
-            print("Using float16 for model.")
-    model = AutoModelForCausalLM.from_pretrained(
-        REPO,
-        device_map={"": 0} if device == "cuda" else None,
-        torch_dtype=model_dtype,
-        low_cpu_mem_usage=True,
-    )
-    # --- Verify EOS Token ---
-    # Use the actual EOS token ID from the loaded model/tokenizer config
-    config_eos_id = model.config.eos_token_id
-    tokenizer_eos_id = tok.eos_token_id
-    if config_eos_id is None:
-        print("🚨 WARNING: model.config.eos_token_id is None!")
-        # Fall back to the hard-coded constant; this needs checking.
-        final_eos_token_id = EOS_TOKEN
-    elif tokenizer_eos_id is not None and config_eos_id != tokenizer_eos_id:
-         print(f"⚠️ WARNING: Mismatch! model.config.eos_token_id ({config_eos_id}) != tok.eos_token_id ({tokenizer_eos_id}). Using model config ID.")
-         final_eos_token_id = config_eos_id
     else:
-         final_eos_token_id = config_eos_id
-    # Update the global constant if it differs or wasn't set properly by config
-    if final_eos_token_id != EOS_TOKEN:
-         print(f"🔄 Updating EOS_TOKEN constant from {EOS_TOKEN} to {final_eos_token_id}")
-         EOS_TOKEN = final_eos_token_id # Update the global constant
-    # Set pad_token_id to the determined EOS token ID
-    model.config.pad_token_id = EOS_TOKEN
-    print(f"Using EOS Token ID: {EOS_TOKEN}")
-    # --- End Verify EOS Token ---
-    print(f"Model loaded to {model.device} with dtype {model.dtype}.")
-    model.eval()
-    # The mask is built from the audio IDs moved to the inference device.
-    audio_ids_device = AUDIO_IDS_CPU.to(device)
-    masker = AudioMask(audio_ids_device, NEW_BLOCK, EOS_TOKEN) # Use updated EOS_TOKEN
-    print("AudioMask initialized.")
-    stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(EOS_TOKEN)]) # Use updated EOS_TOKEN
-    print("StoppingCriteria initialized.")
-    print("✅ Modelle geladen und bereit!", flush=True)
-@app.get("/")
-def hello():
-    return {"status": "ok", "message": "TTS Service is running"}
-# 6) Helper zum Prompt Bauen -------------------------------------------
-def build_prompt(text: str, voice: str) -> tuple[torch.Tensor, torch.Tensor]:
-    """Builds the input_ids and attention_mask for the model."""
-    prompt_text = f"{voice}: {text}"
-    prompt_ids = tok(prompt_text, return_tensors="pt").input_ids.to(device)
-    input_ids = torch.cat([
-        torch.tensor([[START_TOKEN]], device=device, dtype=torch.long),
-        prompt_ids,
-        torch.tensor([[NEW_BLOCK]], device=device, dtype=torch.long)
-    ], dim=1)
-    attention_mask = torch.ones_like(input_ids)
-    return input_ids, attention_mask
-# 7) WebSocket endpoint (simplified with the streamer) -----------------
-@app.websocket("/ws/tts")
-async def tts(ws: WebSocket):
-    """Handle one TTS request per WebSocket connection.
-    Expects a single JSON text message with optional ``text`` and ``voice``
-    keys, runs ``model.generate`` in a worker thread and lets the
-    AudioStreamer send audio bytes back while generation is running.
-    """
-    # NOTE(review): ``masker`` is a module-level object that is reset and used
-    # per request; overlapping connections would share its state — confirm
-    # that only one client is served at a time.
-    await ws.accept()
-    print("🔌 Client connected")
-    streamer = None
-    # The streamer schedules its sends on this loop from the generation thread.
-    main_loop = asyncio.get_running_loop()
-    try:
-        req_text = await ws.receive_text()
-        print(f"Received request: {req_text}")
-        req = json.loads(req_text)
-        text = req.get("text", "Hallo Welt, wie geht es dir heute?")
-        voice = req.get("voice", "Jakob")
-        # The default only covers a missing key; an explicitly empty text is rejected.
-        if not text:
-            print("⚠️ Request text is empty.")
-            await ws.close(code=1003, reason="Text cannot be empty")
-            return
-        print(f"Generating audio for: '{text}' with voice '{voice}'")
-        ids, attn = build_prompt(text, voice)
-        masker.reset()
-        streamer = AudioStreamer(ws, snac, masker, main_loop, device)
-        print("Starting generation in background thread...")
-        # --- DEBUGGING: Adjusted Generation Parameters ---
-        await asyncio.to_thread(
-            model.generate,
-            input_ids=ids,
-            attention_mask=attn,
-            max_new_tokens=1500, # Keep lower for faster debugging cycles initially
-            logits_processor=[masker],
-            stopping_criteria=stopping_criteria,
-            # --- Adjusted Parameters for Debugging Repetition ---
-            do_sample=True,
-            temperature=0.7,     # Slightly higher temperature
-            # top_p=0.9,         # Commented out top_p for simpler testing
-            repetition_penalty=1.2, # Slightly stronger penalty
-            # --- End Adjusted Parameters ---
-            use_cache=True,
-            streamer=streamer
         )
-        print("Generation thread finished.")
-    except WebSocketDisconnect:
-        print("🔌 Client disconnected.")
-    except json.JSONDecodeError:
-        print("❌ Invalid JSON received.")
-        if ws.client_state.name == "CONNECTED":
-            await ws.close(code=1003, reason="Invalid JSON format")
     except Exception as e:
-        # Any other failure: log the traceback and try to report it to the client.
-        error_details = traceback.format_exc()
-        print(f"❌ WS‑Error: {e}\n{error_details}", flush=True)
-        error_payload = json.dumps({"error": str(e)})
-        try:
-            if ws.client_state.name == "CONNECTED":
-                 await ws.send_text(error_payload)
-        except Exception:
-            pass
-        if ws.client_state.name == "CONNECTED":
-            await ws.close(code=1011)
-    finally:
-        # Always finish the streamer and close the socket if it is still open.
-        if streamer:
-            try:
-                streamer.end()
-            except Exception as e_end:
-                 print(f"Error during streamer.end(): {e_end}")
-        print("Closing connection.")
-        if ws.client_state.name == "CONNECTED":
-            try:
-                await ws.close(code=1000)
-            except RuntimeError as e_close:
-                 print(f"Runtime error closing websocket: {e_close}")
-            except Exception as e_close_final:
-                 print(f"Error closing websocket: {e_close_final}")
-        elif ws.client_state.name != "DISCONNECTED":
-             print(f"WebSocket final state: {ws.client_state.name}")
-        print("Connection closed.")
-# 8) Dev‑Start --------------------------------------------------------
 if __name__ == "__main__":
-    import uvicorn
-    print("Starting Uvicorn server...")
-    # Note: Consider running with --workers 1 if you face issues with globals/GPU memory
-    # uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info", workers=1)
-    uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")

+if __name__ == "__main__":
+    print("Starting server")
+    import logging
+    # Enable or disable debug logging
+    DEBUG_LOGGING = False
+    if DEBUG_LOGGING:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.WARNING)
+from RealtimeTTS import (
+    TextToAudioStream,
+    AzureEngine,
+    ElevenlabsEngine,
+    SystemEngine,
+    CoquiEngine,
+    OpenAIEngine,
+    KokoroEngine
+)
+from RealtimeTTS import register_engine
+from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi import FastAPI, Query, Request
+from fastapi.staticfiles import StaticFiles
+from queue import Queue
+import threading
+import logging
+import uvicorn
+import wave
+import io
 import os
+# Port the server listens on; override with the TTS_FASTAPI_PORT environment variable.
+PORT = int(os.environ.get("TTS_FASTAPI_PORT", 8000))
+# NOTE(review): OrpheusEngine is neither imported nor defined in the visible
+# file, so this line raises NameError at import time unless the name is
+# provided elsewhere — confirm where it should come from. "orpheus" is also
+# absent from SUPPORTED_ENGINES below and is never initialized at start-up.
+register_engine("orpheus", OrpheusEngine)
+# Engines that the start-up code attempts to initialize.
+SUPPORTED_ENGINES = [
+    "azure",
+    "openai",
+    "elevenlabs",
+    "system",
+    # "coqui",  # multiple queries are not supported on the coqui engine right now; keep it commented out for tests that restart the server often
+    "kokoro"
+]
+# change start engine by moving engine name
+# to the first position in SUPPORTED_ENGINES
+START_ENGINE = SUPPORTED_ENGINES[0]
+# Lower-case substrings searched for in the User-Agent header to detect browsers.
+BROWSER_IDENTIFIERS = [
+    "mozilla",
+    "chrome",
+    "safari",
+    "firefox",
+    "edge",
+    "opera",
+    "msie",
+    "trident",
+]
+# Origins allowed by the CORS middleware: localhost/127.0.0.1 over http and
+# https, with and without the server port.
+origins = [
+    "http://localhost",
+    f"http://localhost:{PORT}",
+    "http://127.0.0.1",
+    f"http://127.0.0.1:{PORT}",
+    "https://localhost",
+    f"https://localhost:{PORT}",
+    "https://127.0.0.1",
+    f"https://127.0.0.1:{PORT}",
+]
+play_text_to_speech_semaphore = threading.Semaphore(1)
+engines = {}  # engine name -> initialized engine instance
+voices = {}  # engine name -> list of voices retrieved from that engine
+current_engine = None  # engine instance used for new requests; set by _set_engine
+# NOTE(review): speaking_lock and gen_lock are not referenced anywhere else in
+# the visible code; only tts_lock is used (by the /tts endpoint).
+speaking_lock = threading.Lock()
+tts_lock = threading.Lock()
+gen_lock = threading.Lock()
+class TTSRequestHandler:
+    def __init__(self, engine):
+        self.engine = engine
+        self.audio_queue = Queue()
+        self.stream = TextToAudioStream(
+            engine, on_audio_stream_stop=self.on_audio_stream_stop, muted=True
+        )
+        self.speaking = False
+    def on_audio_chunk(self, chunk):
+        self.audio_queue.put(chunk)
+    def on_audio_stream_stop(self):
+        self.audio_queue.put(None)
+        self.speaking = False
+    def play_text_to_speech(self, text):
+        self.speaking = True
+        self.stream.feed(text)
+        logging.debug(f"Playing audio for text: {text}")
+        print(f'Synthesizing: "{text}"')
+        self.stream.play_async(on_audio_chunk=self.on_audio_chunk, muted=True)
+    def audio_chunk_generator(self, send_wave_headers):
+        first_chunk = False
         try:
+            while True:
+                chunk = self.audio_queue.get()
+                if chunk is None:
+                    print("Terminating stream")
+                    break
+                if not first_chunk:
+                    if send_wave_headers:
+                        print("Sending wave header")
+                        yield create_wave_header_for_engine(self.engine)
+                    first_chunk = True
+                yield chunk
         except Exception as e:
+            print(f"Error during streaming: {str(e)}")
+# Application object; files in the local "static" directory are served under /static.
+app = FastAPI()
+app.mount("/static", StaticFiles(directory="static"), name="static")
+# CORS: only the entries of `origins` are allowed; all methods and headers
+# are accepted and credentials are permitted.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Define a CSP that allows 'self' for script sources for firefox
+csp = {
+    "default-src": "'self'",
+    "script-src": "'self'",
+    "style-src": "'self' 'unsafe-inline'",
+    "img-src": "'self' data:",
+    "font-src": "'self' data:",
+    "media-src": "'self' blob:",
+}
+# Serialized header value: "directive value" pairs joined with "; ".
+csp_string = "; ".join(f"{key} {value}" for key, value in csp.items())
+@app.middleware("http")
+async def add_security_headers(request: Request, call_next):
+    response = await call_next(request)
+    response.headers["Content-Security-Policy"] = csp_string
+    return response
+@app.get("/favicon.ico")
+async def favicon():
+    return FileResponse("static/favicon.ico")
+def _set_engine(engine_name):
+    global current_engine, stream
+    if current_engine is None:
+        current_engine = engines[engine_name]
     else:
+        current_engine = engines[engine_name]
+    if voices[engine_name]:
+        engines[engine_name].set_voice(voices[engine_name][0].name)
+@app.get("/set_engine")
+def set_engine(request: Request, engine_name: str = Query(...)):
+    if engine_name not in engines:
+        return {"error": "Engine not supported"}
+    try:
+        _set_engine(engine_name)
+        return {"message": f"Switched to {engine_name} engine"}
+    except Exception as e:
+        logging.error(f"Error switching engine: {str(e)}")
+        return {"error": "Failed to switch engine"}
+def is_browser_request(request):
+    user_agent = request.headers.get("user-agent", "").lower()
+    is_browser = any(browser_id in user_agent for browser_id in BROWSER_IDENTIFIERS)
+    return is_browser
+def create_wave_header_for_engine(engine):
+    _, _, sample_rate = engine.get_stream_info()
+    num_channels = 1
+    sample_width = 2
+    frame_rate = sample_rate
+    wav_header = io.BytesIO()
+    with wave.open(wav_header, "wb") as wav_file:
+        wav_file.setnchannels(num_channels)
+        wav_file.setsampwidth(sample_width)
+        wav_file.setframerate(frame_rate)
+    wav_header.seek(0)
+    wave_header_bytes = wav_header.read()
+    wav_header.close()
+    # Create a new BytesIO with the correct MIME type for Firefox
+    final_wave_header = io.BytesIO()
+    final_wave_header.write(wave_header_bytes)
+    final_wave_header.seek(0)
+    return final_wave_header.getvalue()
+@app.get("/tts")
+async def tts(request: Request, text: str = Query(...)):
+    with tts_lock:
+        request_handler = TTSRequestHandler(current_engine)
+        browser_request = is_browser_request(request)
+        if play_text_to_speech_semaphore.acquire(blocking=False):
+            try:
+                threading.Thread(
+                    target=request_handler.play_text_to_speech,
+                    args=(text,),
+                    daemon=True,
+                ).start()
+            finally:
+                play_text_to_speech_semaphore.release()
+        return StreamingResponse(
+            request_handler.audio_chunk_generator(browser_request),
+            media_type="audio/wav",
         )
+@app.get("/engines")
+def get_engines():
+    return list(engines.keys())
+@app.get("/voices")
+def get_voices():
+    voices_list = []
+    for voice in voices[current_engine.engine_name]:
+        voices_list.append(voice.name)
+    return voices_list
+@app.get("/setvoice")
+def set_voice(request: Request, voice_name: str = Query(...)):
+    print(f"Getting request: {voice_name}")
+    if not current_engine:
+        print("No engine is currently selected")
+        return {"error": "No engine is currently selected"}
+    try:
+        print(f"Setting voice to {voice_name}")
+        current_engine.set_voice(voice_name)
+        return {"message": f"Voice set to {voice_name} successfully"}
     except Exception as e:
+        print(f"Error setting voice: {str(e)}")
+        logging.error(f"Error setting voice: {str(e)}")
+        return {"error": "Failed to set voice"}
+@app.get("/")
+def root_page():
+    engines_options = "".join(
+        [
+            f'<option value="{engine}">{engine.title()}</option>'
+            for engine in engines.keys()
+        ]
+    )
+    content = f"""
+    <!DOCTYPE html>
+    <html>
+        <head>
+            <title>Text-To-Speech</title>
+            <style>
+                body {{
+                    font-family: Arial, sans-serif;
+                    background-color: #f0f0f0;
+                    margin: 0;
+                    padding: 0;
+                }}
+                h2 {{
+                    color: #333;
+                    text-align: center;
+                }}
+                #container {{
+                    width: 80%;
+                    margin: 50px auto;
+                    background-color: #fff;
+                    border-radius: 10px;
+                    padding: 20px;
+                    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+                }}
+                label {{
+                    font-weight: bold;
+                }}
+                select, textarea {{
+                    width: 100%;
+                    padding: 10px;
+                    margin: 10px 0;
+                    border: 1px solid #ccc;
+                    border-radius: 5px;
+                    box-sizing: border-box;
+                    font-size: 16px;
+                }}
+                button {{
+                    display: block;
+                    width: 100%;
+                    padding: 15px;
+                    background-color: #007bff;
+                    border: none;
+                    border-radius: 5px;
+                    color: #fff;
+                    font-size: 16px;
+                    cursor: pointer;
+                    transition: background-color 0.3s;
+                }}
+                button:hover {{
+                    background-color: #0056b3;
+                }}
+                audio {{
+                    width: 80%;
+                    margin: 10px auto;
+                    display: block;
+                }}
+            </style>
+        </head>
+        <body>
+            <div id="container">
+                <h2>Text to Speech</h2>
+                <label for="engine">Select Engine:</label>
+                <select id="engine">
+                    {engines_options}
+                </select>
+                <label for="voice">Select Voice:</label>
+                <select id="voice">
+                    <!-- Options will be dynamically populated by JavaScript -->
+                </select>
+                <textarea id="text" rows="4" cols="50" placeholder="Enter text here..."></textarea>
+                <button id="speakButton">Speak</button>
+                <audio id="audio" controls></audio> <!-- Hidden audio player -->
+            </div>
+            <script src="/static/tts.js"></script>
+        </body>
+    </html>
+    """
+    return HTMLResponse(content=content)
 if __name__ == "__main__":
+    print("Initializing TTS Engines")
+    for engine_name in SUPPORTED_ENGINES:
+        if "azure" == engine_name:
+            azure_api_key = os.environ.get("AZURE_SPEECH_KEY")
+            azure_region = os.environ.get("AZURE_SPEECH_REGION")
+            if azure_api_key and azure_region:
+                print("Initializing azure engine")
+                engines["azure"] = AzureEngine(azure_api_key, azure_region)
+        if "elevenlabs" == engine_name:
+            elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
+            if elevenlabs_api_key:
+                print("Initializing elevenlabs engine")
+                engines["elevenlabs"] = ElevenlabsEngine(elevenlabs_api_key)
+        if "system" == engine_name:
+            print("Initializing system engine")
+            engines["system"] = SystemEngine()
+        if "coqui" == engine_name:
+            print("Initializing coqui engine")
+            engines["coqui"] = CoquiEngine()
+        if "kokoro" == engine_name:
+            print("Initializing kokoro engine")
+            engines["kokoro"] = KokoroEngine()
+        if "openai" == engine_name:
+            print("Initializing openai engine")
+            engines["openai"] = OpenAIEngine()
+    for _engine in engines.keys():
+        print(f"Retrieving voices for TTS Engine {_engine}")
+        try:
+            voices[_engine] = engines[_engine].get_voices()
+        except Exception as e:
+            voices[_engine] = []
+            logging.error(f"Error retrieving voices for {_engine}: {str(e)}")
+    _set_engine(START_ENGINE)
+    print("Server ready")
+    uvicorn.run(app, host="0.0.0.0", port=PORT)