File size: 7,910 Bytes
0bb0b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd175ab
 
ac04834
0bb0b13
 
 
 
ac04834
 
 
 
 
0bb0b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import atexit
import os
import tempfile

import gradio as gr
import httpx
from loguru import logger

# FastAPI endpoint URL - adjust this to match your actual endpoint.
# Read it defensively: os.getenv returns None when unset, and the original
# chained .rstrip('/') would then die with an opaque AttributeError at import
# time. Fail fast with an actionable message instead.
_api_url = os.getenv("API_URL")
if not _api_url:
    raise RuntimeError(
        "API_URL environment variable must be set to the TTS backend base URL "
        "(e.g. http://localhost:8000)"
    )
API_URL = _api_url.rstrip('/')  # normalize so f"{API_URL}/tts" never doubles the slash

# Shared httpx client with retries, bounded timeouts, and connection pooling.
# Reused across requests (closed at exit via atexit below in this module).
client = httpx.Client(
    timeout=httpx.Timeout(
        connect=10.0,    # connection timeout
        read=120.0,      # read timeout (generation can be slow)
        write=10.0,      # write timeout
        pool=None,       # pool timeout (no limit on waiting for a connection)
    ),
    limits=httpx.Limits(
        max_keepalive_connections=5,
        max_connections=10,
        keepalive_expiry=30.0
    ),
    transport=httpx.HTTPTransport(
        retries=3,  # retry transient connection failures
    )
)

def check_api_health():
    """Ping the API root endpoint and report whether it responded OK.

    Returns:
        True when ``GET {API_URL}/`` returns a success status; False when
        the request times out or fails at the HTTP level (logged either way).
    """
    try:
        client.get(f"{API_URL}/").raise_for_status()
    except httpx.TimeoutException as exc:
        logger.error(f"API health check timed out: {str(exc)}")
        return False
    except httpx.HTTPError as exc:
        logger.error(f"API health check failed: {str(exc)}")
        return False
    logger.info("API health check passed")
    return True

def generate_speech(text, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
    """Synthesize *text* via the FastAPI ``/tts`` endpoint.

    Args:
        text: Text to speak; blank/whitespace-only input short-circuits to None.
        temperature: Sampling temperature forwarded to the model.
        top_p: Nucleus-sampling threshold forwarded to the model.
        repetition_penalty: Penalty discouraging repetitive output.
        max_new_tokens: Upper bound on generated audio tokens.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Path to a temporary ``.wav`` file with the generated audio, or None
        for empty input. A file path (not raw bytes) is returned because the
        output ``gr.Audio`` component is configured with ``type="filepath"``.

    Raises:
        gr.Error: On an unhealthy API, non-200 response, timeout, or network
            failure, so the UI surfaces the message to the user.
    """
    if not text.strip():
        logger.warning("Empty text input received")
        return None

    try:
        # Check API health first
        if not check_api_health():
            logger.error("API is not healthy, aborting request")
            raise gr.Error("The API service is currently unavailable. Please try again later.")

        # Log input parameters
        logger.info(f"Generating speech for text: {text[:50]}... with params: temp={temperature}, top_p={top_p}, rep_penalty={repetition_penalty}, max_tokens={max_new_tokens}")

        # Prepare the request payload
        payload = {
            "text": text.strip(),
            "return_type": "wav",  # Request WAV format directly
            "temperature": temperature,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_new_tokens
        }

        # Update progress
        progress(0.3, "Sending request to server ...")

        # Make request to FastAPI endpoint
        response = client.post(
            f"{API_URL}/tts",
            json=payload,
            headers={"Content-Type": "application/json"}
        )

        # Log response status
        logger.debug(f"Received response with status {response.status_code} and content-type {response.headers.get('content-type')}")

        if response.status_code == 200:
            logger.info("Successfully generated speech in WAV format")
            # gr.Audio(type="filepath") expects a path, not raw bytes: persist
            # the WAV payload to a temp file Gradio can serve. delete=False so
            # the file survives past this function for Gradio to read.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(response.content)
                return tmp.name

        error_msg = f"API returned error status {response.status_code}"
        logger.error(error_msg)
        raise gr.Error(error_msg)

    except gr.Error:
        # Already a user-facing error raised above; re-raise as-is instead of
        # letting the generic handler below re-wrap it with a misleading prefix.
        raise
    except httpx.TimeoutException as e:
        error_msg = "Request timed out. The server took too long to respond."
        logger.error(f"{error_msg}: {str(e)}")
        raise gr.Error(error_msg)
    except httpx.HTTPError as e:
        error_msg = f"Network error while generating speech: {str(e)}"
        logger.error(error_msg)
        raise gr.Error(error_msg)
    except Exception as e:
        error_msg = f"Error generating speech: {str(e)}"
        # loguru records the active traceback via logger.exception; the
        # stdlib-style exc_info=True kwarg used before is not a loguru API.
        logger.exception(error_msg)
        raise gr.Error(error_msg)

# Close the shared httpx client at interpreter exit so pooled keep-alive
# connections are released cleanly.
atexit.register(client.close)

# Pre-filled examples for the UI. Each row maps positionally onto
# [text_input, temperature, top_p, repetition_penalty, max_new_tokens]
# (Tajik sample sentences; the first is longer, hence the larger token budget).
examples = [
    [
        "Салом, номи ман Али аст ва ман имрӯз мехоҳам ба шумо дар бораи забони тоҷикӣ ва аҳамияти он дар фарҳанги мо нақл кунам.",
        0.6, 0.95, 1.1, 1800
    ],
    [
        "Имрӯз ҳаво хеле хуб аст ва ман қарор додам, ки бо дӯстонам ба боғ равам ва якҷоя вақт гузаронем.",
        0.6, 0.95, 1.1, 1200
    ],
    [
        "Ман забони тоҷикӣ меомӯзам, зеро мехоҳам бо мардумони гуногун сӯҳбат кунам ва фарҳанги онҳоро беҳтар фаҳмам.",
        0.6, 0.95, 1.1, 1200
    ],
    [
        "Лутфан як пиёла чой диҳед, зеро ман имрӯз хеле хаста шудам ва мехоҳам каме истироҳат кунам.",
        0.6, 0.95, 1.1, 1200
    ],
    [
        "Шумо аз куҷо ҳастед ва чӣ гуна ба омӯзиши забони тоҷикӣ шурӯъ кардед?",
        0.6, 0.95, 1.1, 1200
    ],
]

# Build the Gradio UI: text + sampling controls on the left, audio output on
# the right, with examples and button wiring below.
with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
    # Page header and usage notes rendered as markdown.
    gr.Markdown("""
    # 🎵 [Tajik Orpheus Text-to-Speech](https://huggingface.co/re-skill/orpheus-tj-early)
    
    Enter your text below and hear it converted to natural-sounding speech with the Orpheus TTS model. 
    
    ## Tips for better prompts:
    - Short text prompts generally work better than very long phrases
    - Increasing `repetition_penalty` and `temperature` makes the model speak faster.

    ## Note:
    - This is demo of early checkpoint trained only on `35 Hours` of data.
    - The model was not fine-tuned on a specific voice. Hence, you will get different voices every time you run the model.
    
    """)    
    with gr.Row():
        # Left column: input text plus the sampling hyperparameters that are
        # forwarded verbatim to generate_speech.
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Text to speak", 
                placeholder="Enter your text here...",
                lines=5
            )
            
            # Collapsed by default; slider defaults match the examples rows.
            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temperature", 
                    info="Higher values (0.7-1.0) create more expressive but less stable speech"
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P", 
                    info="Nucleus sampling threshold"
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty", 
                    info="Higher values discourage repetitive patterns"
                )
                max_new_tokens = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Length", 
                    info="Maximum length of generated audio (in tokens)"
                )
            
            with gr.Row():
                submit_btn = gr.Button("Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear")
                
        # Right column: playback of the generated speech.
        with gr.Column(scale=2):
            # NOTE(review): type="filepath" makes Gradio expect a file path
            # from the handler, but generate_speech returns raw WAV bytes —
            # verify the installed Gradio version accepts bytes here.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"  # Changed from "auto" to "filepath" to handle WAV bytes
            )
            
    # Clickable example rows; cache_examples=False so each click hits the API.
    gr.Examples(
        examples=examples,
        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False,
    )
    
    # Wire the generate button to the TTS handler.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
        outputs=audio_output
    )
    
    # Clear resets both the text box and the audio player.
    clear_btn.click(
        fn=lambda: (None, None),
        inputs=[],
        outputs=[text_input, audio_output]
    )

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)