"""Gradio front-end for the Tajik Orpheus text-to-speech FastAPI service.

Sends synthesis requests to the backend's /tts endpoint and plays the
returned WAV audio in the browser.
"""

import atexit
import os
import tempfile

import gradio as gr
import httpx
from loguru import logger

# FastAPI endpoint URL - must be provided via the environment. Fail fast with
# a clear message instead of an AttributeError on None.rstrip().
_raw_api_url = os.getenv("API_URL")
if not _raw_api_url:
    raise RuntimeError("API_URL environment variable is not set")
API_URL = _raw_api_url.rstrip("/")

# Configure httpx client with retries and timeouts
client = httpx.Client(
    timeout=httpx.Timeout(
        connect=10.0,  # connection timeout
        read=120.0,    # read timeout (TTS generation can be slow)
        write=10.0,    # write timeout
        pool=None,     # pool timeout (no limit waiting for a connection)
    ),
    limits=httpx.Limits(
        max_keepalive_connections=5,
        max_connections=10,
        keepalive_expiry=30.0,
    ),
    transport=httpx.HTTPTransport(
        retries=3,  # Number of retries
    ),
)


def check_api_health():
    """Return True if the API root endpoint responds with a success status."""
    try:
        response = client.get(f"{API_URL}/")
        response.raise_for_status()
        logger.info("API health check passed")
        return True
    except httpx.TimeoutException as e:
        logger.error(f"API health check timed out: {str(e)}")
        return False
    except httpx.HTTPError as e:
        logger.error(f"API health check failed: {str(e)}")
        return False


def generate_speech(text, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
    """Generate speech from *text* via the backend /tts endpoint.

    Returns a filesystem path to a WAV file (required by the
    gr.Audio(type="filepath") output component), or None for empty input.
    Raises gr.Error on any failure so the UI shows a user-facing message.

    NOTE: ``progress=gr.Progress()`` as a default argument is the Gradio
    idiom for progress injection, not a mutable-default bug.
    """
    if not text.strip():
        logger.warning("Empty text input received")
        return None
    try:
        # Check API health first
        if not check_api_health():
            logger.error("API is not healthy, aborting request")
            raise gr.Error("The API service is currently unavailable. Please try again later.")

        # Log input parameters
        logger.info(
            f"Generating speech for text: {text[:50]}... "
            f"with params: temp={temperature}, top_p={top_p}, "
            f"rep_penalty={repetition_penalty}, max_tokens={max_new_tokens}"
        )

        # Prepare the request payload
        payload = {
            "text": text.strip(),
            "return_type": "wav",  # Request WAV format directly
            "temperature": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_new_tokens,
        }

        # Update progress
        progress(0.3, "Sending request to server ...")

        # Make request to FastAPI endpoint
        response = client.post(
            f"{API_URL}/tts",
            json=payload,
            headers={"Content-Type": "application/json"},
        )

        # Log response status
        logger.debug(
            f"Received response with status {response.status_code} "
            f"and content-type {response.headers.get('content-type')}"
        )

        if response.status_code == 200:
            logger.info("Successfully generated speech in WAV format")
            # The Audio output component is configured with type="filepath",
            # so the WAV bytes must be written to disk and the path returned
            # (raw bytes are not a valid filepath value for gr.Audio).
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(response.content)
                return tmp.name
        else:
            error_msg = f"API returned error status {response.status_code}"
            logger.error(error_msg)
            raise gr.Error(error_msg)

    except httpx.TimeoutException as e:
        error_msg = "Request timed out. The server took too long to respond."
        logger.error(f"{error_msg}: {str(e)}")
        raise gr.Error(error_msg)
    except httpx.HTTPError as e:
        error_msg = f"Network error while generating speech: {str(e)}"
        logger.error(error_msg)
        raise gr.Error(error_msg)
    except Exception as e:
        error_msg = f"Error generating speech: {str(e)}"
        logger.error(error_msg, exc_info=True)
        raise gr.Error(error_msg)


# Clean up client on exit
atexit.register(client.close)

# Examples for the UI: [text, temperature, top_p, repetition_penalty, max_new_tokens]
examples = [
    [
        "Салом, номи ман Али аст ва ман имрӯз мехоҳам ба шумо дар бораи забони тоҷикӣ ва аҳамияти он дар фарҳанги мо нақл кунам.",
        0.6, 0.95, 1.1, 1800
    ],
    [
        "Имрӯз ҳаво хеле хуб аст ва ман қарор додам, ки бо дӯстонам ба боғ равам ва якҷоя вақт гузаронем.",
        0.6, 0.95, 1.1, 1200
    ],
    [
        "Ман забони тоҷикӣ меомӯзам, зеро мехоҳам бо мардумони гуногун сӯҳбат кунам ва фарҳанги онҳоро беҳтар фаҳмам.",
        0.6, 0.95, 1.1, 1200
    ],
    [
        "Лутфан як пиёла чой диҳед, зеро ман имрӯз хеле хаста шудам ва мехоҳам каме истироҳат кунам.",
        0.6, 0.95, 1.1, 1200
    ],
    [
        "Шумо аз куҷо ҳастед ва чӣ гуна ба омӯзиши забони тоҷикӣ шурӯъ кардед?",
        0.6, 0.95, 1.1, 1200
    ],
]

# Create Gradio interface
with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
    gr.Markdown("""
    # 🎵 [Tajik Orpheus Text-to-Speech](https://huggingface.co/re-skill/orpheus-tj-early)
    Enter your text below and hear it converted to natural-sounding speech with the Orpheus TTS model.

    ## Tips for better prompts:
    - Short text prompts generally work better than very long phrases
    - Increasing `repetition_penalty` and `temperature` makes the model speak faster.

    ## Note:
    - This is demo of early checkpoint trained only on `35 Hours` of data.
    - The model was not fine-tuned on a specific voice. Hence, you will get different voices every time you run the model.
    """)

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter your text here...",
                lines=5
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temperature",
                    info="Higher values (0.7-1.0) create more expressive but less stable speech"
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                    info="Nucleus sampling threshold"
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                    info="Higher values discourage repetitive patterns"
                )
                max_new_tokens = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Length",
                    info="Maximum length of generated audio (in tokens)"
                )

            with gr.Row():
                submit_btn = gr.Button("Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear")

        with gr.Column(scale=2):
            # Audio component plays the WAV file written by generate_speech
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"  # generate_speech returns a path to a WAV file
            )

    # Set up examples
    gr.Examples(
        examples=examples,
        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False,
    )

    # Set up event handlers
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
        outputs=audio_output
    )

    clear_btn.click(
        fn=lambda: (None, None),
        inputs=[],
        outputs=[text_input, audio_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)