|
import gradio as gr |
|
import httpx |
|
import os |
|
import atexit |
|
from loguru import logger |
|
|
|
|
|
# Base URL of the TTS API, read from the API_URL environment variable.
# os.getenv returns None for an unset variable, and calling .rstrip on None
# would abort the import with an AttributeError. Default to an empty string
# instead so a missing setting surfaces as a failed (and logged) request
# rather than a crash at startup. Surrounding whitespace and trailing
# slashes are dropped so endpoint paths can be appended as f"{API_URL}/...".
API_URL = os.getenv("API_URL", "").strip().rstrip("/")
|
|
|
|
|
# Shared HTTP client used for every call to the TTS API.
#
# The connection limits are configured on the transport itself: httpx only
# applies a client-level ``limits=`` argument to the transport it creates on
# its own, so combining ``limits=`` with an explicit ``transport=`` on the
# client leaves the limits without effect.
client = httpx.Client(
    timeout=httpx.Timeout(
        connect=10.0,
        read=120.0,  # generous read timeout for the speech-generation response
        write=10.0,
        pool=None,  # no timeout while waiting for a free pooled connection
    ),
    transport=httpx.HTTPTransport(
        retries=3,
        limits=httpx.Limits(
            max_keepalive_connections=5,
            max_connections=10,
            keepalive_expiry=30.0,
        ),
    ),
)
|
|
|
def check_api_health():
    """Probe the API root endpoint and report whether it responded successfully.

    Returns:
        bool: True when the GET request to ``API_URL`` + "/" completes and
        ``raise_for_status`` does not raise; False when the request times out
        or fails with any other ``httpx.HTTPError`` (the failure is logged).
    """
    health_url = f"{API_URL}/"
    try:
        client.get(health_url).raise_for_status()
    except httpx.TimeoutException as exc:
        # Checked before HTTPError so timeouts get their own log message.
        logger.error(f"API health check timed out: {str(exc)}")
        return False
    except httpx.HTTPError as exc:
        logger.error(f"API health check failed: {str(exc)}")
        return False

    logger.info("API health check passed")
    return True
|
|
|
def generate_speech(text, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
    """Send ``text`` to the TTS API and return the generated audio.

    Args:
        text: Text to synthesize. ``None`` or whitespace-only input is ignored.
        temperature: Sent as ``temperature`` in the request payload.
        top_p: Sent as ``top_p`` in the request payload.
        repetition_penalty: Sent as ``repetition_penalty`` in the request payload.
        max_new_tokens: Sent as ``max_new_tokens`` in the request payload.
        progress: Gradio progress tracker, updated before the request is sent.

    Returns:
        The raw response body of the ``/tts`` endpoint (requested with
        ``return_type="wav"``) when the API answers with status 200, or
        ``None`` when ``text`` is empty.

    Raises:
        gr.Error: If the health check fails, the API answers with a non-200
            status, the request times out, or any other error occurs.
    """
    # Guard against None as well as blank strings; calling .strip() on None
    # would raise outside the error handling below.
    if not text or not text.strip():
        logger.warning("Empty text input received")
        return None

    try:
        if not check_api_health():
            logger.error("API is not healthy, aborting request")
            raise gr.Error("The API service is currently unavailable. Please try again later.")

        logger.info(f"Generating speech for text: {text[:50]}... with params: temp={temperature}, top_p={top_p}, rep_penalty={repetition_penalty}, max_tokens={max_new_tokens}")

        payload = {
            "text": text.strip(),
            "return_type": "wav",
            "temperature": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_new_tokens,
        }

        progress(0.3, "Sending request to server ...")

        response = client.post(
            f"{API_URL}/tts",
            json=payload,
            headers={"Content-Type": "application/json"},
        )

        logger.debug(f"Received response with status {response.status_code} and content-type {response.headers.get('content-type')}")

        if response.status_code == 200:
            logger.info("Successfully generated speech in WAV format")
            return response.content

        error_msg = f"API returned error status {response.status_code}"
        logger.error(error_msg)
        raise gr.Error(error_msg)

    except gr.Error:
        # Already a user-facing error raised (and logged) above; let it through
        # unchanged instead of re-wrapping it in the generic handler below.
        raise
    except httpx.TimeoutException as e:
        error_msg = "Request timed out. The server took too long to respond."
        logger.error(f"{error_msg}: {str(e)}")
        raise gr.Error(error_msg) from e
    except httpx.HTTPError as e:
        error_msg = f"Network error while generating speech: {str(e)}"
        logger.error(error_msg)
        raise gr.Error(error_msg) from e
    except Exception as e:
        error_msg = f"Error generating speech: {str(e)}"
        # loguru has no ``exc_info`` keyword (extra keyword arguments are used
        # as str.format arguments for the message); logger.exception records
        # the active traceback.
        logger.exception(error_msg)
        raise gr.Error(error_msg) from e
|
|
|
|
|
# Close the shared HTTP client when the interpreter exits.
atexit.register(client.close)
|
|
|
|
|
# Example rows for the UI. Each row is
# [text, temperature, top_p, repetition_penalty, max_new_tokens];
# only the text and the token budget differ between rows.
examples = [
    [prompt, 0.6, 0.95, 1.1, token_budget]
    for prompt, token_budget in (
        (
            "Салом, номи ман Али аст ва ман имрӯз мехоҳам ба шумо дар бораи забони тоҷикӣ ва аҳамияти он дар фарҳанги мо нақл кунам.",
            1800,
        ),
        (
            "Имрӯз ҳаво хеле хуб аст ва ман қарор додам, ки бо дӯстонам ба боғ равам ва якҷоя вақт гузаронем.",
            1200,
        ),
        (
            "Ман забони тоҷикӣ меомӯзам, зеро мехоҳам бо мардумони гуногун сӯҳбат кунам ва фарҳанги онҳоро беҳтар фаҳмам.",
            1200,
        ),
        (
            "Лутфан як пиёла чой диҳед, зеро ман имрӯз хеле хаста шудам ва мехоҳам каме истироҳат кунам.",
            1200,
        ),
        (
            "Шумо аз куҷо ҳастед ва чӣ гуна ба омӯзиши забони тоҷикӣ шурӯъ кардед?",
            1200,
        ),
    )
]
|
|
|
|
|
# Gradio UI: a text box plus sampling controls on the left, the generated
# audio on the right, followed by clickable examples and the button wiring.
with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
    gr.Markdown("""
    # 🎵 [Tajik Orpheus Text-to-Speech](https://huggingface.co/re-skill/orpheus-tj-early)

    Enter your text below and hear it converted to natural-sounding speech with the Orpheus TTS model.

    ## Tips for better prompts:
    - Short text prompts generally work better than very long phrases
    - Increasing `repetition_penalty` and `temperature` makes the model speak faster.

    ## Note:
    - This is demo of early checkpoint trained only on `35 Hours` of data.
    - The model was not fine-tuned on a specific voice. Hence, you will get different voices every time you run the model.

    """)

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter your text here...",
                lines=5,
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temperature",
                    info="Higher values (0.7-1.0) create more expressive but less stable speech",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                    info="Nucleus sampling threshold",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                    info="Higher values discourage repetitive patterns",
                )
                max_new_tokens = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Length",
                    info="Maximum length of generated audio (in tokens)",
                )

            with gr.Row():
                submit_btn = gr.Button("Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear")

        with gr.Column(scale=2):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )

    # Input components in the positional order generate_speech expects;
    # shared by the examples and the submit button so they cannot drift apart.
    speech_inputs = [text_input, temperature, top_p, repetition_penalty, max_new_tokens]

    gr.Examples(
        examples=examples,
        inputs=speech_inputs,
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False,
    )

    submit_btn.click(
        fn=generate_speech,
        inputs=speech_inputs,
        outputs=audio_output,
    )

    # Reset the text box to an empty string rather than None so that a later
    # "Generate Speech" click always hands the handler a str; the audio
    # output is cleared with None.
    clear_btn.click(
        fn=lambda: ("", None),
        inputs=[],
        outputs=[text_input, audio_output],
    )
|
|
|
|
|
# Script entry point: enable request queuing, then start the server locally
# (no public share link, server-side rendering disabled).
if __name__ == "__main__":
    queued_demo = demo.queue()
    queued_demo.launch(share=False, ssr_mode=False)
|
|