# tajik-tts / app.py
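"""Gradio front-end for the Tajik Orpheus text-to-speech demo.

Text entered in the UI is forwarded to a FastAPI backend at ``{API_URL}/tts``,
and the WAV audio returned by the backend is played in the browser. The backend
URL must be supplied via the ``API_URL`` environment variable, e.g.
(illustrative value only):

    API_URL=https://my-tts-backend.example.com python app.py
"""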
import gradio as gr
import httpx
import os
import atexit
from loguru import logger
# FastAPI endpoint URL (read from the API_URL environment variable)
API_URL = os.getenv("API_URL", "").rstrip('/')
if not API_URL:
    raise RuntimeError("The API_URL environment variable must be set")
# Configure httpx client with retries and timeouts
client = httpx.Client(
timeout=httpx.Timeout(
connect=10.0, # connection timeout
read=120.0, # read timeout
write=10.0, # write timeout
        pool=None,     # no pool-acquire timeout (wait as long as needed for a free connection)
),
limits=httpx.Limits(
max_keepalive_connections=5,
max_connections=10,
keepalive_expiry=30.0
),
transport=httpx.HTTPTransport(
        retries=3,  # retry failed connection attempts (HTTP error responses are not retried)
)
)
def check_api_health():
"""Check if the API is healthy before making requests"""
try:
response = client.get(f"{API_URL}/")
response.raise_for_status()
logger.info("API health check passed")
return True
except httpx.TimeoutException as e:
logger.error(f"API health check timed out: {str(e)}")
return False
except httpx.HTTPError as e:
logger.error(f"API health check failed: {str(e)}")
return False
def generate_speech(text, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
    """Send the text to the TTS backend and return the generated WAV audio as bytes."""
if not text.strip():
logger.warning("Empty text input received")
return None
try:
# Check API health first
if not check_api_health():
logger.error("API is not healthy, aborting request")
raise gr.Error("The API service is currently unavailable. Please try again later.")
# Log input parameters
logger.info(f"Generating speech for text: {text[:50]}... with params: temp={temperature}, top_p={top_p}, rep_penalty={repetition_penalty}, max_tokens={max_new_tokens}")
# Prepare the request payload
payload = {
"text": text.strip(),
"return_type": "wav", # Request WAV format directly
"temperature": temperature,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
"max_new_tokens": max_new_tokens
}
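        # For reference, a request built from the UI defaults would look like
        # this (illustrative values, matching the slider defaults below):
        # {"text": "...", "return_type": "wav", "temperature": 0.6,
        #  "top_p": 0.95, "repetition_penalty": 1.1, "max_new_tokens": 1200}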
# Update progress
        progress(0.3, desc="Sending request to the server...")
# Make request to FastAPI endpoint
response = client.post(
f"{API_URL}/tts",
json=payload,
headers={"Content-Type": "application/json"}
)
# Log response status
logger.debug(f"Received response with status {response.status_code} and content-type {response.headers.get('content-type')}")
# Return the WAV bytes directly
if response.status_code == 200:
logger.info("Successfully generated speech in WAV format")
return response.content
else:
error_msg = f"API returned error status {response.status_code}"
logger.error(error_msg)
raise gr.Error(error_msg)
except httpx.TimeoutException as e:
error_msg = "Request timed out. The server took too long to respond."
logger.error(f"{error_msg}: {str(e)}")
raise gr.Error(error_msg)
except httpx.HTTPError as e:
error_msg = f"Network error while generating speech: {str(e)}"
logger.error(error_msg)
raise gr.Error(error_msg)
    except gr.Error:
        # Propagate errors that were already formatted for the UI above
        raise
    except Exception as e:
        error_msg = f"Error generating speech: {str(e)}"
        logger.exception(error_msg)  # loguru: logs the message together with the traceback
        raise gr.Error(error_msg)
# Clean up client on exit
atexit.register(client.close)
# Examples for the UI
examples = [
[
"Салом, номи ман Али аст ва ман имрӯз мехоҳам ба шумо дар бораи забони тоҷикӣ ва аҳамияти он дар фарҳанги мо нақл кунам.",
0.6, 0.95, 1.1, 1800
],
[
"Имрӯз ҳаво хеле хуб аст ва ман қарор додам, ки бо дӯстонам ба боғ равам ва якҷоя вақт гузаронем.",
0.6, 0.95, 1.1, 1200
],
[
"Ман забони тоҷикӣ меомӯзам, зеро мехоҳам бо мардумони гуногун сӯҳбат кунам ва фарҳанги онҳоро беҳтар фаҳмам.",
0.6, 0.95, 1.1, 1200
],
[
"Лутфан як пиёла чой диҳед, зеро ман имрӯз хеле хаста шудам ва мехоҳам каме истироҳат кунам.",
0.6, 0.95, 1.1, 1200
],
[
"Шумо аз куҷо ҳастед ва чӣ гуна ба омӯзиши забони тоҷикӣ шурӯъ кардед?",
0.6, 0.95, 1.1, 1200
],
]
# Create Gradio interface
with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
gr.Markdown("""
# 🎵 [Tajik Orpheus Text-to-Speech](https://huggingface.co/re-skill/orpheus-tj-early)
Enter your text below and hear it converted to natural-sounding speech with the Orpheus TTS model.
## Tips for better prompts:
- Short text prompts generally work better than very long phrases
- Increasing `repetition_penalty` and `temperature` makes the model speak faster.
## Note:
    - This is a demo of an early checkpoint trained on only `35 hours` of data.
    - The model was not fine-tuned on a specific voice, so you will get a different voice each time you run it.
""")
with gr.Row():
with gr.Column(scale=3):
text_input = gr.Textbox(
label="Text to speak",
placeholder="Enter your text here...",
lines=5
)
with gr.Accordion("Advanced Settings", open=False):
temperature = gr.Slider(
minimum=0.1, maximum=1.5, value=0.6, step=0.05,
label="Temperature",
info="Higher values (0.7-1.0) create more expressive but less stable speech"
)
top_p = gr.Slider(
minimum=0.1, maximum=1.0, value=0.95, step=0.05,
label="Top P",
info="Nucleus sampling threshold"
)
repetition_penalty = gr.Slider(
minimum=1.0, maximum=2.0, value=1.1, step=0.05,
label="Repetition Penalty",
info="Higher values discourage repetitive patterns"
)
max_new_tokens = gr.Slider(
minimum=100, maximum=2000, value=1200, step=100,
label="Max Length",
info="Maximum length of generated audio (in tokens)"
)
with gr.Row():
submit_btn = gr.Button("Generate Speech", variant="primary")
clear_btn = gr.Button("Clear")
with gr.Column(scale=2):
            # Audio output for the WAV bytes returned by generate_speech
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
# Set up examples
gr.Examples(
examples=examples,
inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
outputs=audio_output,
fn=generate_speech,
cache_examples=False,
)
# Set up event handlers
submit_btn.click(
fn=generate_speech,
inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
outputs=audio_output
)
clear_btn.click(
fn=lambda: (None, None),
inputs=[],
outputs=[text_input, audio_output]
)
# Launch the app
if __name__ == "__main__":
demo.queue().launch(share=False, ssr_mode=False)