Spaces:

mrrtmob
/

khmer-tts

Running on Zero

App Files Files Community

mrrtmob committed 25 days ago

Commit

d1e3c74

1 Parent(s): 94f2fb2

No code changes made.

Browse files

Files changed (1) hide show

app.py +330 -109

app.py CHANGED Viewed

@@ -5,114 +5,231 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print("Loading SNAC model...")
-snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-snac_model = snac_model.to(device)
-model_name = "mrrtmob/tts-khm-4"
-# Download only model config and safetensors
-snapshot_download(
-    repo_id=model_name,
-    allow_patterns=[
-        "config.json",
-        "*.safetensors",
-        "model.safetensors.index.json",
-    ],
-    ignore_patterns=[
-        "optimizer.pt",
-        "pytorch_model.bin",
-        "training_args.bin",
-        "scheduler.pt",
-        "tokenizer.json",
-        "tokenizer_config.json",
-        "special_tokens_map.json",
-        "vocab.json",
-        "merges.txt",
-        "tokenizer.*"
-    ]
-)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
-model.to(device)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-print(f"Khmer TTS model loaded to {device}")
-# Process text prompt
-def process_prompt(prompt, voice, tokenizer, device):
-    prompt = f"{voice}: {prompt}"
-    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-    start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
-    end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
-    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
-    # No padding needed for single input
-    attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
-# Parse output tokens to audio
 def parse_output(generated_ids):
     token_to_find = 128257
     token_to_remove = 128258
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
     if len(token_indices[1]) > 0:
         last_occurrence_idx = token_indices[1][-1].item()
         cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
     else:
         cropped_tensor = generated_ids
     processed_rows = []
     for row in cropped_tensor:
         masked_row = row[row != token_to_remove]
         processed_rows.append(masked_row)
     code_lists = []
     for row in processed_rows:
         row_length = row.size(0)
         new_length = (row_length // 7) * 7
         trimmed_row = row[:new_length]
-        trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
-    return code_lists[0]  # Return just the first one for single sample
-# Redistribute codes for audio generation
-def redistribute_codes(code_list, snac_model):
-    device = next(snac_model.parameters()).device  # Get the device of SNAC model
     layer_1 = []
     layer_2 = []
     layer_3 = []
-    for i in range((len(code_list)+1)//7):
-        layer_1.append(code_list[7*i])
-        layer_2.append(code_list[7*i+1]-4096)
-        layer_3.append(code_list[7*i+2]-(2*4096))
-        layer_3.append(code_list[7*i+3]-(3*4096))
-        layer_2.append(code_list[7*i+4]-(4*4096))
-        layer_3.append(code_list[7*i+5]-(5*4096))
-        layer_3.append(code_list[7*i+6]-(6*4096))
-    # Move tensors to the same device as the SNAC model
-    codes = [
-        torch.tensor(layer_1, device=device).unsqueeze(0),
-        torch.tensor(layer_2, device=device).unsqueeze(0),
-        torch.tensor(layer_3, device=device).unsqueeze(0)
-    ]
-    audio_hat = snac_model.decode(codes)
-    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
-# Main generation function
-@spaces.GPU()
-def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, voice="Elise", progress=gr.Progress()):
-    if not text.strip():
-        return None
     try:
-        progress(0.1, "Processing text...")
-        input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
-        progress(0.3, "Generating speech tokens...")
         with torch.no_grad():
             generated_ids = model.generate(
                 input_ids=input_ids,
@@ -124,77 +241,177 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=128258,
             )
-        progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
-        progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
-        return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
-# Examples for the UI - Khmer text examples
 examples = [
-    ["ជំរាបសួរ ខ្ញុំឈ្មោះ តារា ហើយខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។"],
-    ["ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច ឬ <sigh> ថប់ដង្ហើម។"],
-    ["ខ្ញុំរស់នៅក្នុងទីក្រុងភ្នំពេញ ហើយមានចរាចរណ៍ <gasp> ច្រើនណាស់។"],
-    ["ពេលខ្លះ ពេលខ្ញុំនិយាយច្រើនពេក ខ្ញុំត្រូវ <cough> សុំទោស។"],
-    ["ការនិយាយនៅចំពោះមុខសាធារណៈ អាចមានការពិបាក។ <groan> ប៉ុន្តែបើហាត់ គេអាចធ្វើបាន។"],
 ]
-# Available voices (commented out for simpler UI)
-# VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
-# Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 # Create Gradio interface
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
-    """)
     text_input = gr.Textbox(
-        label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
-        placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ...",
-        lines=4
     )
-    # Voice selector (commented out)
-    # voice = gr.Dropdown(
-    #     choices=VOICES,
-    #     value="tara",
-    #     label="Voice (សំលេង)"
-    # )
     # Advanced Settings
     with gr.Accordion("🔧 Advanced Settings", open=False):
         with gr.Row():
             temperature = gr.Slider(
                 minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                label="Temperature",
                 info="Higher values create more expressive speech"
             )
             top_p = gr.Slider(
                 minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                label="Top P",
                 info="Nucleus sampling threshold"
             )
         with gr.Row():
             repetition_penalty = gr.Slider(
                 minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                label="Repetition Penalty",
                 info="Higher values discourage repetitive patterns"
             )
             max_new_tokens = gr.Slider(
-                minimum=100, maximum=2000, value=1200, step=100,
-                label="Max Length",
-                info="Maximum length of generated audio"
             )
     with gr.Row():
@@ -202,12 +419,11 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         clear_btn = gr.Button("🗑️ Clear", size="lg")
     audio_output = gr.Audio(
-        label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
         type="numpy",
         show_label=True
     )
-    # Set up examples (NO CACHE)
     gr.Examples(
         examples=examples,
         inputs=[text_input],
@@ -216,10 +432,10 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         cache_examples=False,
     )
-    # Set up event handlers
     submit_btn.click(
         fn=generate_speech,
-        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
@@ -228,6 +444,11 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
-    demo.queue().launch(share=False)

 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
+import os
+import re
+import numpy as np
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# Global variables to store models
+snac_model = None
+model = None
+tokenizer = None
def load_models():
    """Fill the module-level ``snac_model``, ``model`` and ``tokenizer`` globals.

    Steps, in order:
      1. Load the SNAC codec ("hubertsiuzdak/snac_24khz") and move it to
         the module-level ``device``.
      2. Call ``snapshot_download`` for the weight, config and tokenizer
         files of "mrrtmob/tts-khm-4", ignoring optimizer / scheduler /
         training-state files.
      3. Load the causal LM in bfloat16; ``device_map="auto"`` is passed
         only when ``device`` is "cuda".
      4. Load the tokenizer and, if it has no pad token, reuse its EOS
         token as the pad token.
    """
    global snac_model, model, tokenizer

    repo = "mrrtmob/tts-khm-4"
    wanted_files = [
        "config.json",
        "*.safetensors",
        "model.safetensors.index.json",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.json",
        "merges.txt"
    ]
    unwanted_files = [
        "optimizer.pt",
        "pytorch_model.bin",
        "training_args.bin",
        "scheduler.pt"
    ]

    print("Loading SNAC model...")
    codec = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
    snac_model = codec.to(device)

    # Fetch only the files needed for inference.
    print("Downloading model files...")
    snapshot_download(
        repo_id=repo,
        allow_patterns=wanted_files,
        ignore_patterns=unwanted_files,
    )

    print("Loading main model...")
    placement = "auto" if device == "cuda" else None
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        torch_dtype=torch.bfloat16,
        device_map=placement,
    )

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(repo)
    if tokenizer.pad_token is None:
        # Fall back to EOS when the tokenizer defines no pad token.
        tokenizer.pad_token = tokenizer.eos_token

    print(f"Khmer TTS model loaded to {device}")
+# Load models at startup
+load_models()
def split_text_by_punctuation(text, max_chars=200):
    """Split text into chunks of at most ``max_chars`` characters.

    The text is first cut after sentence-ending marks (Khmer ``។``, ``!``,
    ``?``). If that yields at most one sentence, the text is cut after
    clause separators (``,``, ``;``, ``:``) instead. A piece that is still
    longer than ``max_chars`` is packed word by word on whitespace. A single
    whitespace-free run that is itself too long is hard-wrapped, so the
    limit is always honoured even for text written without spaces.

    Args:
        text: Input text.
        max_chars: Maximum chunk length in characters. Coerced to ``int``;
            values below 1 are treated as 1.

    Returns:
        List of non-empty, stripped chunks in their original order.
    """
    import unicodedata  # local import: only the hard-wrap fallback needs it

    max_chars = max(1, int(max_chars))

    # Khmer and common punctuation
    sentence_endings = r'[។!?]'
    clause_separators = r'[,;:]'

    def _split_after(source, delimiter_class):
        # Split on the delimiter and glue each delimiter back onto the
        # segment that precedes it.
        pieces = re.split(f'({delimiter_class})', source)
        merged = []
        for i in range(0, len(pieces), 2):
            segment = pieces[i]
            if i + 1 < len(pieces):
                segment += pieces[i + 1]
            segment = segment.strip()
            if segment:
                merged.append(segment)
        return merged

    def _hard_wrap(run):
        # Cut a whitespace-free run into pieces of at most max_chars.
        # The cut is moved left while it would land before a combining mark
        # or right after a Khmer coeng (U+17D2), so a consonant cluster is
        # not torn apart unless the cluster alone exceeds the limit.
        pieces = []
        while len(run) > max_chars:
            cut = max_chars
            while cut > 0 and (
                unicodedata.category(run[cut])[0] == "M"
                or run[cut - 1] == "\u17d2"
            ):
                cut -= 1
            if cut == 0:
                cut = max_chars
            pieces.append(run[:cut])
            run = run[cut:]
        pieces.append(run)
        return pieces

    combined_sentences = _split_after(text, sentence_endings)
    # If no sentence endings found, split by clauses
    if len(combined_sentences) <= 1:
        combined_sentences = _split_after(text, clause_separators)

    final_chunks = []
    for sentence in combined_sentences:
        if len(sentence) <= max_chars:
            final_chunks.append(sentence)
            continue
        # Pack whitespace-separated words greedily into chunks.
        current_chunk = ""
        for word in sentence.split():
            test_chunk = current_chunk + " " + word if current_chunk else word
            if len(test_chunk) <= max_chars:
                current_chunk = test_chunk
                continue
            if current_chunk:
                final_chunks.append(current_chunk)
            # The word alone may exceed the limit (e.g. unspaced Khmer).
            *full_pieces, current_chunk = _hard_wrap(word)
            final_chunks.extend(full_pieces)
        if current_chunk:
            final_chunks.append(current_chunk)

    return [chunk for chunk in final_chunks if chunk.strip()]
def split_text_by_tokens(text, max_tokens=150):
    """Split text so each chunk encodes to at most ``max_tokens`` tokens.

    Uses the module-level ``tokenizer``. If the whole text already fits it
    is returned unchanged as a single chunk. Otherwise whitespace-separated
    words are packed greedily, re-encoding the candidate chunk after each
    added word. A single word that exceeds the limit on its own is still
    emitted as its own chunk.
    """
    global tokenizer

    # Fast path: nothing to split.
    if len(tokenizer.encode(text)) <= max_tokens:
        return [text]

    pieces = []
    pending = []  # words of the chunk currently being built
    for word in text.split():
        candidate = " ".join(pending + [word])
        if len(tokenizer.encode(candidate)) <= max_tokens:
            pending.append(word)
            continue
        if pending:
            pieces.append(" ".join(pending))
        pending = [word]

    if pending:
        pieces.append(" ".join(pending))
    return pieces
def process_prompt(prompt, voice, tokenizer, device):
    """Build the model input for one utterance.

    The text ``"{voice}: {prompt}"`` is tokenized, then framed with token
    128259 in front and tokens 128009, 128260 behind. The attention mask is
    all ones because there is a single, unpadded sequence.

    Returns:
        ``(input_ids, attention_mask)``, both moved to ``device``.
    """
    text_ids = tokenizer(f"{voice}: {prompt}", return_tensors="pt").input_ids
    framed_ids = torch.cat(
        [
            torch.tensor([[128259]], dtype=torch.int64),
            text_ids,
            torch.tensor([[128009, 128260]], dtype=torch.int64),
        ],
        dim=1,
    )
    mask = torch.ones_like(framed_ids)
    return framed_ids.to(device), mask.to(device)
 def parse_output(generated_ids):
     """Turn generated token ids into a flat list of audio codes.

     Everything up to and including the last occurrence of token 128257 is
     dropped. In the first row, every 128258 token (the ``eos_token_id``
     passed to ``model.generate``) is removed and the row is trimmed to a
     multiple of 7 tokens. 128266 is subtracted from each remaining token,
     with negative results clamped to 0.

     Args:
         generated_ids: 2-D integer tensor of shape (batch, sequence).

     Returns:
         List of plain Python ints for the first row, or ``[]`` when no
         complete group of 7 tokens is available.
     """
     token_to_find = 128257
     token_to_remove = 128258
     code_offset = 128266
     marker_columns = (generated_ids == token_to_find).nonzero(as_tuple=True)[1]
     if len(marker_columns) > 0:
         cropped = generated_ids[:, marker_columns[-1].item() + 1:]
     else:
         cropped = generated_ids
     if cropped.size(0) == 0:
         return []
     # Only the first sequence is used (single-sample generation).
     row = cropped[0]
     row = row[row != token_to_remove]
     usable = (row.size(0) // 7) * 7
     # One vectorized subtract/clamp and a single transfer to Python ints,
     # instead of a per-token loop that yields 0-dim tensors.
     return (row[:usable] - code_offset).clamp(min=0).tolist()
def redistribute_codes(code_list, snac_model):
    """Decode a flat list of interleaved codes into audio samples.

    Codes are consumed in frames of 7. Within a frame, position ``k``
    carries an offset of ``k * 4096`` that is removed before the values are
    routed to three layers: position 0 -> layer 1, positions 1 and 4 ->
    layer 2, positions 2, 3, 5 and 6 -> layer 3. A trailing partial frame
    is ignored.

    After the offset is removed every code is clamped to ``[0, 4095]``. The
    lower bound matches the previous behaviour. The upper bound is new and
    stops a misaligned token from indexing past the codebook.
    NOTE(review): 4095 assumes a 4096-entry codebook per layer, as implied
    by the 4096 offsets used here -- confirm against the SNAC config.

    Args:
        code_list: Sequence of ints (or 0-dim integer tensors).
        snac_model: Object exposing ``parameters()`` and ``decode(codes)``.

    Returns:
        1-D numpy array of samples, or 12000 zeros (0.5 s at 24 kHz) when
        fewer than 7 codes are supplied or decoding fails.
    """
    if not code_list or len(code_list) < 7:
        return np.zeros(12000)  # 0.5 seconds of silence
    device = next(snac_model.parameters()).device
    try:
        frame_count = len(code_list) // 7
        flat = [int(code) for code in code_list[:frame_count * 7]]
        frames = torch.tensor(flat, dtype=torch.long).view(frame_count, 7)
        # Remove the per-position offset, then keep indices inside the codebook.
        frames = (frames - 4096 * torch.arange(7)).clamp(0, 4095).to(device)
        codes = [
            frames[:, 0].contiguous().unsqueeze(0),
            frames[:, [1, 4]].reshape(1, -1),
            frames[:, [2, 3, 5, 6]].reshape(1, -1),
        ]
        with torch.no_grad():
            audio_hat = snac_model.decode(codes)
        return audio_hat.detach().squeeze().cpu().numpy()
    except Exception as e:
        # Best effort: a failed decode becomes silence rather than an error.
        print(f"Error in redistribute_codes: {e}")
        return np.zeros(12000)
+@spaces.GPU(duration=120)
+def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=800, voice="Elise"):
+    """Generate speech for a single chunk"""
+    global model, tokenizer, snac_model
+    if not text_chunk.strip():
+        return np.array([])
+    try:
+        input_ids, attention_mask = process_prompt(text_chunk, voice, tokenizer, device)
         with torch.no_grad():
             generated_ids = model.generate(
                 input_ids=input_ids,
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=128258,
+                pad_token_id=tokenizer.eos_token_id,
+                use_cache=True
             )
         code_list = parse_output(generated_ids)
+        if not code_list:
+            return np.array([])
         audio_samples = redistribute_codes(code_list, snac_model)
+        return audio_samples
+    except Exception as e:
+        print(f"Error generating speech chunk: {e}")
+        return np.array([])
def combine_audio_chunks(audio_chunks, pause_duration=0.3):
    """Concatenate audio chunks, inserting silence between neighbours.

    Empty chunks are dropped before pauses are placed, so a pause only ever
    sits between two chunks that contain audio; there is no leading or
    trailing pause. The silence uses the chunks' common dtype, so float32
    audio is not upcast to float64 by the padding.

    Args:
        audio_chunks: Sequence of 1-D sample arrays (24 kHz).
        pause_duration: Silence between chunks in seconds; negative values
            are treated as 0.

    Returns:
        1-D numpy array with the combined audio, or an empty array when
        there is nothing to combine.
    """
    if not audio_chunks:
        return np.array([])

    voiced = [np.asarray(chunk) for chunk in audio_chunks if len(chunk) > 0]
    if not voiced:
        return np.array([])

    # 24 kHz sample rate; never ask numpy for a negative-length array.
    pause_samples = max(0, int(24000 * pause_duration))
    pause = np.zeros(pause_samples, dtype=np.result_type(*voiced))

    pieces = [voiced[0]]
    for chunk in voiced[1:]:
        pieces.append(pause)
        pieces.append(chunk)
    return np.concatenate(pieces)
def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=800,
                   voice="Elise", split_method="punctuation", max_chars=200, max_tokens=150,
                   pause_duration=0.3, progress=gr.Progress()):
    """Synthesize speech for ``text``, splitting long input into chunks.

    ``split_method`` selects the splitter: "punctuation" uses
    ``split_text_by_punctuation`` with ``max_chars``, "tokens" uses
    ``split_text_by_tokens`` with ``max_tokens``, and any other value keeps
    the text as one chunk. Each chunk is rendered with
    ``generate_speech_chunk``; chunks that yield no samples are skipped and
    the rest are joined by ``combine_audio_chunks`` with ``pause_duration``
    seconds of silence between them.

    Returns:
        ``(24000, samples)`` on success, or ``None`` when the text is
        blank, no chunk produced audio, or an exception was raised (the
        exception is printed with its traceback).
    """
    if not text.strip():
        return None

    try:
        progress(0.05, "Splitting text...")
        if split_method == "punctuation":
            pieces = split_text_by_punctuation(text, max_chars)
        elif split_method == "tokens":
            pieces = split_text_by_tokens(text, max_tokens)
        else:  # "none"
            pieces = [text]
        total = len(pieces)

        progress(0.1, f"Processing {total} chunks...")
        print(f"Split text into {total} chunks:")
        for number, piece in enumerate(pieces, start=1):
            print(f"Chunk {number}: {piece[:50]}...")

        # Render every chunk, keeping only those that produced samples.
        rendered = []
        for index, piece in enumerate(pieces):
            number = index + 1
            progress(0.1 + 0.7 * (index / total), f"Generating chunk {number}/{total}...")
            samples = generate_speech_chunk(
                piece, temperature, top_p, repetition_penalty, max_new_tokens, voice
            )
            if len(samples) == 0:
                continue
            rendered.append(samples)
            print(f"Generated audio for chunk {number}: {len(samples)} samples ({len(samples)/24000:.2f}s)")

        if not rendered:
            return None

        progress(0.9, "Combining audio chunks...")
        final_audio = combine_audio_chunks(rendered, pause_duration)
        progress(1.0, "Complete!")
        print(f"Final audio: {len(final_audio)} samples ({len(final_audio)/24000:.2f}s)")
        return (24000, final_audio)
    except Exception as e:
        # UI boundary: report the failure and return no audio instead of raising.
        print(f"Error generating speech: {e}")
        import traceback
        traceback.print_exc()
        return None
+# Examples
 examples = [
+    ["ជំរាបសួរ ខ្ញុំឈ្មោះ Kiri ។ ខ្ញុំគឺជាម៉ូដែលផលិតសំលេងនិយាយ។ ខ្ញុំអាចបង្កើតសំលេងនិយាយផ្សេងៗ ដូចជា <laugh> សើច ឬ <sigh> ថប់ដង្ហើម។ ខ្ញុំរស់នៅក្នុងទីក្រុងភ្នំពេញ ហើយមានចរាចរណ៍ <gasp> ច្រើនណាស់។"],
+    ["ការនិយាយនៅចំពោះមុខសាធារណៈ អាចមានការពិបាក។ <groan> ប៉ុន្តែបើហាត់ គេអាចធ្វើបាន។ ពេលខ្លះ ពេលខ្ញុំនិយាយច្រើនពេក ខ្ញុំត្រូវ <cough> សុំទោស។ វាជារឿងធម្មតា។"],
 ]
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 # Create Gradio interface
 with gr.Blocks(title="Khmer Text-to-Speech") as demo:
     gr.Markdown(f"""
     # 🎵 Khmer Text-to-Speech
     **ម៉ូដែលបម្លែងអត្ថបទជាសំលេង**
     បញ្ចូលអត្ថបទខ្មែររបស់អ្នក ហើយស្តាប់ការបម្លែងទៅជាសំលេងនិយាយ។
     💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
+    ✨ **New**: Supports long text with automatic splitting!
+    """)
     text_input = gr.Textbox(
+        label="Enter Khmer text (បញ្ចូលអត្ថបទខ្មែរ)",
+        placeholder="បញ្ចូលអត្ថបទខ្មែររបស់អ្នកនៅទីនេះ... (អាចវែងបាន)",
+        lines=6
     )
+    # Text splitting options
+    with gr.Accordion("📝 Text Splitting Options", open=True):
+        split_method = gr.Radio(
+            choices=[
+                ("Split by punctuation (recommended)", "punctuation"),
+                ("Split by token count", "tokens"),
+                ("No splitting", "none")
+            ],
+            value="punctuation",
+            label="Text splitting method",
+            info="For long texts, splitting helps avoid the 15s limit"
+        )
+        with gr.Row():
+            max_chars = gr.Slider(
+                minimum=50, maximum=500, value=200, step=25,
+                label="Max characters per chunk (punctuation mode)",
+                info="Shorter chunks = more natural breaks but more processing time"
+            )
+            max_tokens = gr.Slider(
+                minimum=50, maximum=300, value=150, step=25,
+                label="Max tokens per chunk (token mode)",
+                info="Controls chunk size based on model tokenization"
+            )
+        pause_duration = gr.Slider(
+            minimum=0.0, maximum=1.0, value=0.3, step=0.1,
+            label="Pause between chunks (seconds)",
+            info="Silence duration between text chunks"
+        )
     # Advanced Settings
     with gr.Accordion("🔧 Advanced Settings", open=False):
         with gr.Row():
             temperature = gr.Slider(
                 minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                label="Temperature",
                 info="Higher values create more expressive speech"
             )
             top_p = gr.Slider(
                 minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                label="Top P",
                 info="Nucleus sampling threshold"
             )
         with gr.Row():
             repetition_penalty = gr.Slider(
                 minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                label="Repetition Penalty",
                 info="Higher values discourage repetitive patterns"
             )
             max_new_tokens = gr.Slider(
+                minimum=100, maximum=1200, value=800, step=100,
+                label="Max tokens per chunk",
+                info="Lower values for shorter, more reliable generation"
             )
     with gr.Row():
         clear_btn = gr.Button("🗑️ Clear", size="lg")
     audio_output = gr.Audio(
+        label="Generated Speech (សំលេងដែលបង្កើតឡើង)",
         type="numpy",
         show_label=True
     )
     gr.Examples(
         examples=examples,
         inputs=[text_input],
         cache_examples=False,
     )
     submit_btn.click(
         fn=generate_speech,
+        inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens,
+                gr.State("Elise"), split_method, max_chars, max_tokens, pause_duration],
         outputs=audio_output
     )
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
+    demo.queue(max_size=10).launch(
+        share=False,
+        server_name="0.0.0.0",
+        server_port=7860
+    )