Spaces: Running on Zero
kore
app.py CHANGED
@@ -3,268 +3,116 @@ from snac import SNAC
  3   import torch
  4   import gradio as gr
  5   from transformers import AutoModelForCausalLM, AutoTokenizer
  6 - from huggingface_hub import snapshot_download
  7   from dotenv import load_dotenv
  8 - import os
  9 - import re
 10 - import numpy as np
 11 -
 12   load_dotenv()
 13 -
 14 - # Setup Hugging Face authentication
 15 - def setup_auth():
 16 -     hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
 17 -     if hf_token:
 18 -         try:
 19 -             login(token=hf_token, add_to_git_credential=False)
 20 -             print("✅ Successfully logged in to Hugging Face")
 21 -             return True
 22 -         except Exception as e:
 23 -             print(f"⚠️ Failed to login to Hugging Face: {e}")
 24 -             return False
 25 -     else:
 26 -         print("⚠️ No HF token found. Running as anonymous user.")
 27 -         return False
 28 -
 29 - # Setup authentication before anything else
 30 - auth_success = setup_auth()
 31 -
 32   # Check if CUDA is available
 33   device = "cuda" if torch.cuda.is_available() else "cpu"
 34 - print(
 64 -     ignore_patterns=[
 65 -         "optimizer.pt",
 66 -         "pytorch_model.bin",
 67 -         "training_args.bin",
 68 -         "scheduler.pt"
 69 -     ]
 70 - )
 71 -
 72 - print("Loading main model...")
 73 - if device == "cuda":
 74 -     model = AutoModelForCausalLM.from_pretrained(
 75 -         model_name,
 76 -         torch_dtype=torch.bfloat16,
 77 -         low_cpu_mem_usage=True
 78 -     )
 79 -     model = model.to(device)
 80 - else:
 81 -     model = AutoModelForCausalLM.from_pretrained(
 82 -         model_name,
 83 -         torch_dtype=torch.float32
 84 -     )
 85 -
 86 - print("Loading tokenizer...")
 87 - tokenizer = AutoTokenizer.from_pretrained(model_name)
 88 -
 89 - if tokenizer.pad_token is None:
 90 -     tokenizer.pad_token = tokenizer.eos_token
 91 -
 92 - print(f"Khmer TTS model loaded to {device}")
 93 -
 94 - # Load models at startup
 95 - load_models()
 96 -
 97 - def split_text_by_punctuation(text, max_chars=200):
 98 -     sentence_endings = r'[។!?]'
 99 -     clause_separators = r'[,;:]'
100 -
101 -     sentences = re.split(f'({sentence_endings})', text)
102 -
103 -     combined_sentences = []
104 -     for i in range(0, len(sentences), 2):
105 -         sentence = sentences[i]
106 -         if i + 1 < len(sentences):
107 -             sentence += sentences[i + 1]
108 -         if sentence.strip():
109 -             combined_sentences.append(sentence.strip())
110 -
111 -     if len(combined_sentences) <= 1:
112 -         parts = re.split(f'({clause_separators})', text)
113 -         combined_sentences = []
114 -         for i in range(0, len(parts), 2):
115 -             part = parts[i]
116 -             if i + 1 < len(parts):
117 -                 part += parts[i + 1]
118 -             if part.strip():
119 -                 combined_sentences.append(part.strip())
120 -
121 -     final_chunks = []
122 -     for sentence in combined_sentences:
123 -         if len(sentence) <= max_chars:
124 -             final_chunks.append(sentence)
125 -         else:
126 -             words = sentence.split()
127 -             current_chunk = ""
128 -
129 -             for word in words:
130 -                 test_chunk = current_chunk + " " + word if current_chunk else word
131 -                 if len(test_chunk) <= max_chars:
132 -                     current_chunk = test_chunk
133 -                 else:
134 -                     if current_chunk:
135 -                         final_chunks.append(current_chunk)
136 -                     current_chunk = word
137 -
138 -             if current_chunk:
139 -                 final_chunks.append(current_chunk)
140 -
141 -     return [chunk for chunk in final_chunks if chunk.strip()]
142 -
143 - def split_text_by_tokens(text, max_tokens=150):
144 -     global tokenizer
145 -
146 -     tokens = tokenizer.encode(text)
147 -
148 -     if len(tokens) <= max_tokens:
149 -         return [text]
150 -
151 -     chunks = []
152 -     words = text.split()
153 -     current_chunk = ""
154 -
155 -     for word in words:
156 -         test_chunk = current_chunk + " " + word if current_chunk else word
157 -         test_tokens = tokenizer.encode(test_chunk)
158 -
159 -         if len(test_tokens) <= max_tokens:
160 -             current_chunk = test_chunk
161 -         else:
162 -             if current_chunk:
163 -                 chunks.append(current_chunk)
164 -             current_chunk = word
165 -
166 -     if current_chunk:
167 -         chunks.append(current_chunk)
168 -
169 -     return chunks
170 -
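Both deleted splitters share one greedy word-packing idea: grow a chunk word by word until the next word would push it past the budget, then start a new chunk. A minimal standalone sketch of that idea, under a hypothetical name (pack_words is not part of the app):

def pack_words(text, max_chars=150):
    # Greedily pack whole words into chunks of at most max_chars characters.
    chunks, current = [], ""
    for word in text.split():
        candidate = f"{current} {word}".strip()
        if len(candidate) <= max_chars:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks

print(pack_words("one two three four five six", max_chars=9))
# ['one two', 'three', 'four five', 'six']

split_text_by_tokens applies the same loop but measures each candidate chunk with tokenizer.encode instead of len, which is slower but matches the model's real budget.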
171   def process_prompt(prompt, voice, tokenizer, device):
172       prompt = f"{voice}: {prompt}"
173       input_ids = tokenizer(prompt, return_tensors="pt").input_ids
174 -
175 -
176 -
177       attention_mask = torch.ones_like(modified_input_ids)
178       return modified_input_ids.to(device), attention_mask.to(device)
179 -
180   def parse_output(generated_ids):
181       token_to_find = 128257
182       token_to_remove = 128258
183 -     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
184
185       if len(token_indices[1]) > 0:
186           last_occurrence_idx = token_indices[1][-1].item()
187           cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
188       else:
189           cropped_tensor = generated_ids
190 -
191       processed_rows = []
192       for row in cropped_tensor:
193           masked_row = row[row != token_to_remove]
194           processed_rows.append(masked_row)
195 -
196       code_lists = []
197       for row in processed_rows:
198           row_length = row.size(0)
199           new_length = (row_length // 7) * 7
200           trimmed_row = row[:new_length]
201 -         trimmed_row = [
202           code_lists.append(trimmed_row)
203 -
204 -     return code_lists[0] if code_lists and len(code_lists[0]) > 0 else []
205 -
206 - def redistribute_codes(code_list, snac_model):
207 -     if not code_list or len(code_list) < 7:
208 -         return np.zeros(12000)
209
210 -
211       layer_1 = []
212       layer_2 = []
213       layer_3 = []
214
215       try:
216 -
217 -
218 -             layer_2.append(max(0, code_list[7*i+1]-4096))
219 -             layer_3.append(max(0, code_list[7*i+2]-(2*4096)))
220 -             layer_3.append(max(0, code_list[7*i+3]-(3*4096)))
221 -             layer_2.append(max(0, code_list[7*i+4]-(4*4096)))
222 -             layer_3.append(max(0, code_list[7*i+5]-(5*4096)))
223 -             layer_3.append(max(0, code_list[7*i+6]-(6*4096)))
224 -
225 -         codes = [
226 -             torch.tensor(layer_1, device=device).unsqueeze(0),
227 -             torch.tensor(layer_2, device=device).unsqueeze(0),
228 -             torch.tensor(layer_3, device=device).unsqueeze(0)
229 -         ]
230 -
231 -         with torch.no_grad():
232 -             audio_hat = snac_model.decode(codes)
233 -         return audio_hat.detach().squeeze().cpu().numpy()
234 -     except Exception as e:
235 -         print(f"Error in redistribute_codes: {e}")
236 -         return np.zeros(12000)
237 -
238 - def combine_audio_chunks(audio_chunks, pause_duration=0.3):
239 -     if not audio_chunks:
240 -         return np.array([])
241 -
242 -     pause_samples = int(24000 * pause_duration)
243 -     pause = np.zeros(pause_samples)
244 -
245 -     combined_audio = []
246 -     for i, chunk in enumerate(audio_chunks):
247 -         if len(chunk) > 0:
248 -             combined_audio.append(chunk)
249 -         if i < len(audio_chunks) - 1:
250 -             combined_audio.append(pause)
251 -
252 -     if combined_audio:
253 -         return np.concatenate(combined_audio)
254 -     else:
255 -         return np.array([])
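The removed combine_audio_chunks joins chunks with silence: at the app's 24 kHz output rate, a 0.3 s pause is int(24000 * 0.3) = 7200 zero samples between consecutive chunks. A standalone sketch of that logic (hypothetical helper name):

import numpy as np

def join_with_pauses(chunks, pause_s=0.3, rate=24000):
    # Insert pause_s seconds of silence between consecutive chunks.
    pause = np.zeros(int(rate * pause_s))
    pieces = []
    for i, chunk in enumerate(chunks):
        pieces.append(chunk)
        if i < len(chunks) - 1:
            pieces.append(pause)
    return np.concatenate(pieces) if pieces else np.array([])

print(join_with_pauses([np.ones(2400), np.ones(4800)]).shape)  # (14400,)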
256 -
257 - @spaces.GPU(duration=60)  # Reduced duration to be more conservative
258 - def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=600, voice="Elise"):
259 -     """Generate speech for a single chunk"""
260 -     global model, tokenizer, snac_model
261 -
262 -     if not text_chunk.strip():
263 -         return np.array([])
264 -
265 -     try:
266 -         input_ids, attention_mask = process_prompt(text_chunk, voice, tokenizer, device)
267
268           with torch.no_grad():
269               generated_ids = model.generate(
270                   input_ids=input_ids,

@@ -276,143 +124,77 @@ def generate_speech_chunk(text_chunk, temperature=0.6, top_p=0.95, repetition_pe
276                   repetition_penalty=repetition_penalty,
277                   num_return_sequences=1,
278                   eos_token_id=128258,
279 -                 pad_token_id=tokenizer.eos_token_id,
280 -                 use_cache=True
281               )
282
283           code_list = parse_output(generated_ids)
284
285 -
286 -             return np.array([])
287 -
288           audio_samples = redistribute_codes(code_list, snac_model)
289 -         return audio_samples
290 -
291 -     except Exception as e:
292 -         print(f"Error generating speech chunk: {e}")
293 -         return np.array([])
294 -
295 - def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=600,
296 -                     voice="Elise", split_method="punctuation", max_chars=150, max_tokens=100,
297 -                     pause_duration=0.3, progress=gr.Progress()):
298 -     """Main function to generate speech with text splitting"""
299 -
300 -     if not text.strip():
301 -         return None
302 -
303 -     try:
304 -         progress(0.05, "Splitting text...")
305 -
306 -         if split_method == "punctuation":
307 -             text_chunks = split_text_by_punctuation(text, max_chars)
308 -         elif split_method == "tokens":
309 -             text_chunks = split_text_by_tokens(text, max_tokens)
310 -         else:
311 -             text_chunks = [text]
312 -
313 -         progress(0.1, f"Processing {len(text_chunks)} chunks...")
314 -         print(f"Split text into {len(text_chunks)} chunks:")
315 -         for i, chunk in enumerate(text_chunks):
316 -             print(f"Chunk {i+1}: {chunk[:50]}...")
317 -
318 -         audio_chunks = []
319 -         for i, chunk in enumerate(text_chunks):
320 -             progress(0.1 + 0.7 * (i / len(text_chunks)), f"Generating chunk {i+1}/{len(text_chunks)}...")
321 -
322 -             audio = generate_speech_chunk(
323 -                 chunk, temperature, top_p, repetition_penalty, max_new_tokens, voice
324 -             )
325 -
326 -             if len(audio) > 0:
327 -                 audio_chunks.append(audio)
328 -                 print(f"Generated audio for chunk {i+1}: {len(audio)} samples ({len(audio)/24000:.2f}s)")
329 -
330 -         if not audio_chunks:
331 -             return None
332 -
333 -         progress(0.9, "Combining audio chunks...")
334 -         final_audio = combine_audio_chunks(audio_chunks, pause_duration)
335 -
336 -         progress(1.0, "Complete!")
337 -         print(f"Final audio: {len(final_audio)} samples ({len(final_audio)/24000:.2f}s)")
338 -
339 -         return (24000, final_audio)
340
341       except Exception as e:
342           print(f"Error generating speech: {e}")
343 -         import traceback
344 -         traceback.print_exc()
345           return None
346 -
347 - # [Rest of your Gradio interface code remains the same]
348   examples = [
349 -     ["ααααΆααα½α αααα»αααααα
350       ["αααα»αα’αΆα αααααΎαααααααα·ααΆααααααα ααΌα ααΆ <laugh> ααΎα α¬ <sigh> ααααααα αΎαα"],
351   ]
352 -
353   EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
354 -
355   with gr.Blocks(title="Khmer Text-to-Speech") as demo:
356       gr.Markdown(f"""
357       # 🎵 Khmer Text-to-Speech
358       **αααΌαααααααααα’αααααααΆααααα**
359 -     Authentication: {'✅ Pro Account' if auth_success else '❌ Anonymous (Limited)'}
360
361       αααα αΌαα’ααααααααααααααα’ααα α αΎαααααΆααααΆααααααααα ααΆααααααα·ααΆαα
362       💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
363 -
364 -     """)
365
366       text_input = gr.Textbox(
367 -         label="Enter Khmer text (αααα αΌαα’αααααααααα)",
368 -         placeholder="αααα αΌαα’ααααααααααααααα’ααααα ααΈααα...
369 -         lines=
370       )
371

372 -
373 -
374 -
375 -
376 -
377 -
378 -         ],
379 -         value="punctuation",
380 -         label="Text splitting method"
381 -     )
382 -
383 -     with gr.Row():
384 -         max_chars = gr.Slider(
385 -             minimum=50, maximum=300, value=150, step=25,
386 -             label="Max characters per chunk"
387 -         )
388 -         max_tokens = gr.Slider(
389 -             minimum=50, maximum=200, value=100, step=25,
390 -             label="Max tokens per chunk"
391 -         )
392 -
393 -     pause_duration = gr.Slider(
394 -         minimum=0.0, maximum=1.0, value=0.3, step=0.1,
395 -         label="Pause between chunks (seconds)"
396 -     )
397
398       with gr.Accordion("🔧 Advanced Settings", open=False):
399           with gr.Row():
400               temperature = gr.Slider(
401                   minimum=0.1, maximum=1.5, value=0.6, step=0.05,
402 -                 label="Temperature"
403               )
404               top_p = gr.Slider(
405                   minimum=0.1, maximum=1.0, value=0.95, step=0.05,
406 -                 label="Top P"
407               )
408           with gr.Row():
409               repetition_penalty = gr.Slider(
410                   minimum=1.0, maximum=2.0, value=1.1, step=0.05,
411 -                 label="Repetition Penalty"
412               )
413               max_new_tokens = gr.Slider(
414 -                 minimum=100, maximum=
415 -                 label="Max
416               )
417
418       with gr.Row():

@@ -420,11 +202,12 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
420           clear_btn = gr.Button("🗑️ Clear", size="lg")
421
422       audio_output = gr.Audio(
423 -         label="Generated Speech (αααααααααααααΎαα‘αΎα)",
424           type="numpy",
425           show_label=True
426       )
427

428       gr.Examples(
429           examples=examples,
430           inputs=[text_input],

@@ -433,10 +216,10 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
433           cache_examples=False,
434       )
435

436       submit_btn.click(
437           fn=generate_speech,
438 -         inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens,
439 -                 gr.State("Elise"), split_method, max_chars, max_tokens, pause_duration],
440           outputs=audio_output
441       )
442

@@ -445,10 +228,6 @@ with gr.Blocks(title="Khmer Text-to-Speech") as demo:
445           inputs=[],
446           outputs=[text_input, audio_output]
447       )
448 -
449   if __name__ == "__main__":
450 -     demo.queue(
451 -         share=False,
452 -         server_name="0.0.0.0",
453 -         server_port=7860
454 -     )
  3   import torch
  4   import gradio as gr
  5   from transformers import AutoModelForCausalLM, AutoTokenizer
  6 + from huggingface_hub import snapshot_download
  7   from dotenv import load_dotenv
  8   load_dotenv()
  9   # Check if CUDA is available
 10   device = "cuda" if torch.cuda.is_available() else "cpu"
 11 + print("Loading SNAC model...")
 12 + snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 13 + snac_model = snac_model.to(device)
 14 + model_name = "mrrtmob/tts-khm-kore"
 15 + # Download only model config and safetensors
 16 + snapshot_download(
 17 +     repo_id=model_name,
 18 +     allow_patterns=[
 19 +         "config.json",
 20 +         "*.safetensors",
 21 +         "model.safetensors.index.json",
 22 +     ],
 23 +     ignore_patterns=[
 24 +         "optimizer.pt",
 25 +         "pytorch_model.bin",
 26 +         "training_args.bin",
 27 +         "scheduler.pt",
 28 +         "tokenizer.json",
 29 +         "tokenizer_config.json",
 30 +         "special_tokens_map.json",
 31 +         "vocab.json",
 32 +         "merges.txt",
 33 +         "tokenizer.*"
 34 +     ]
 35 + )
 36 + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
 37 + model.to(device)
 38 + tokenizer = AutoTokenizer.from_pretrained(model_name)
 39 + print(f"Khmer TTS model loaded to {device}")
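The new startup path downloads only what inference needs: allow_patterns whitelists the config and safetensors weights, while ignore_patterns blacklists optimizer state and tokenizer files (AutoTokenizer fetches its own files on demand). A minimal sketch of the same glob filtering, fetching just the config to inspect the repo cheaply before committing to the full weights; the pattern choice here is illustrative and assumes network access to the repo named in the diff:

from huggingface_hub import snapshot_download

# Fetch only config.json; returns the local cache directory that
# contains just the files matching allow_patterns.
local_dir = snapshot_download(
    repo_id="mrrtmob/tts-khm-kore",
    allow_patterns=["config.json"],
)
print(local_dir)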
 40 + # Process text prompt
 41   def process_prompt(prompt, voice, tokenizer, device):
 42       prompt = f"{voice}: {prompt}"
 43       input_ids = tokenizer(prompt, return_tensors="pt").input_ids
 44 +
 45 +     start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
 46 +     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
 47 +
 48 +     modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
 49 +
 50 +     # No padding needed for single input
 51       attention_mask = torch.ones_like(modified_input_ids)
 52 +
 53       return modified_input_ids.to(device), attention_mask.to(device)
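process_prompt wraps the tokenized text in the special-token frame the model expects: 128259 (start of human), the prompt tokens, then 128009 (end of text) and 128260 (end of human), all taken from the code above. A toy check of that framing with dummy IDs standing in for real tokenizer output:

import torch

input_ids = torch.tensor([[101, 102, 103]])  # stand-in for tokenizer output
start_token = torch.tensor([[128259]], dtype=torch.int64)
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
framed = torch.cat([start_token, input_ids, end_tokens], dim=1)
print(framed)  # tensor([[128259,    101,    102,    103, 128009, 128260]])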
 54 + # Parse output tokens to audio
 55   def parse_output(generated_ids):
 56       token_to_find = 128257
 57       token_to_remove = 128258
 58
 59 +     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
 60       if len(token_indices[1]) > 0:
 61           last_occurrence_idx = token_indices[1][-1].item()
 62           cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
 63       else:
 64           cropped_tensor = generated_ids
 65 +
 66       processed_rows = []
 67       for row in cropped_tensor:
 68           masked_row = row[row != token_to_remove]
 69           processed_rows.append(masked_row)
 70 +
 71       code_lists = []
 72       for row in processed_rows:
 73           row_length = row.size(0)
 74           new_length = (row_length // 7) * 7
 75           trimmed_row = row[:new_length]
 76 +         trimmed_row = [t - 128266 for t in trimmed_row]
 77           code_lists.append(trimmed_row)
 78
 79 +     return code_lists[0]  # Return just the first one for single sample
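parse_output crops everything up to the last 128257 (start-of-audio marker), drops 128258 (EOS padding), keeps only whole 7-token frames, and shifts the remaining IDs down by the 128266 vocabulary offset. The same steps on a tiny synthetic batch, using the IDs from the function above:

import torch

gen = torch.tensor([[1, 2, 128257, 128266, 128270, 128280, 128290,
                     128300, 128310, 128320, 128330, 128258]])
start = (gen == 128257).nonzero(as_tuple=True)[1][-1].item()
codes = gen[:, start + 1:]
codes = codes[codes != 128258]             # boolean mask flattens to 1-D
codes = codes[: (codes.numel() // 7) * 7]  # keep whole 7-token frames
codes = [int(t) - 128266 for t in codes]   # remove the vocabulary offset
print(codes)  # [0, 4, 14, 24, 34, 44, 54]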
 80 + # Redistribute codes for audio generation
 81 + def redistribute_codes(code_list, snac_model):
 82 +     device = next(snac_model.parameters()).device  # Get the device of SNAC model
 83 +
 84       layer_1 = []
 85       layer_2 = []
 86       layer_3 = []
 87 +     for i in range((len(code_list)+1)//7):
 88 +         layer_1.append(code_list[7*i])
 89 +         layer_2.append(code_list[7*i+1]-4096)
 90 +         layer_3.append(code_list[7*i+2]-(2*4096))
 91 +         layer_3.append(code_list[7*i+3]-(3*4096))
 92 +         layer_2.append(code_list[7*i+4]-(4*4096))
 93 +         layer_3.append(code_list[7*i+5]-(5*4096))
 94 +         layer_3.append(code_list[7*i+6]-(6*4096))
 95 +
 96 +     # Move tensors to the same device as the SNAC model
 97 +     codes = [
 98 +         torch.tensor(layer_1, device=device).unsqueeze(0),
 99 +         torch.tensor(layer_2, device=device).unsqueeze(0),
100 +         torch.tensor(layer_3, device=device).unsqueeze(0)
101 +     ]
102 +
103 +     audio_hat = snac_model.decode(codes)
104 +     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
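Each 7-code frame consumed by redistribute_codes interleaves the three SNAC codebook layers: slot 0 feeds the coarse layer, slots 1 and 4 the middle layer, and slots 2, 3, 5, 6 the fine layer, with slot k offset by k*4096 so all seven positions share one flat code space. A quick check of that layout on one synthetic frame:

# One synthetic 7-code frame: slot k stores (code + k*4096).
frame = [7, 4096 + 11, 2*4096 + 21, 3*4096 + 22,
         4*4096 + 12, 5*4096 + 23, 6*4096 + 24]

layer_1 = [frame[0]]                               # coarse: slot 0
layer_2 = [frame[1] - 4096, frame[4] - 4*4096]     # middle: slots 1, 4
layer_3 = [frame[2] - 2*4096, frame[3] - 3*4096,   # fine: slots 2, 3, 5, 6
           frame[5] - 5*4096, frame[6] - 6*4096]
print(layer_1, layer_2, layer_3)  # [7] [11, 12] [21, 22, 23, 24]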
105 + # Main generation function
106 + @spaces.GPU()
107 + def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200, voice="Elise", progress=gr.Progress()):
108 +     if not text.strip():
109 +         return None
110
111       try:
112 +         progress(0.1, "Processing text...")
113 +         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
114
115 +         progress(0.3, "Generating speech tokens...")
116           with torch.no_grad():
117               generated_ids = model.generate(
118                   input_ids=input_ids,

124                   repetition_penalty=repetition_penalty,
125                   num_return_sequences=1,
126                   eos_token_id=128258,
127               )
128
129 +         progress(0.6, "Processing speech tokens...")
130           code_list = parse_output(generated_ids)
131
132 +         progress(0.8, "Converting to audio...")
133           audio_samples = redistribute_codes(code_list, snac_model)
134
135 +         return (24000, audio_samples)  # Return sample rate and audio
136       except Exception as e:
137           print(f"Error generating speech: {e}")
138           return None
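With the globals above initialized, generate_speech can also be driven without the UI; it returns (24000, samples) on success and None on failure. A hypothetical headless sketch, assuming the module has loaded its models and noting that outside a ZeroGPU Space the spaces.GPU decorator may need to be removed or stubbed:

from scipy.io import wavfile

# progress is overridden with a no-op callable so no Gradio context is needed.
result = generate_speech("sample text", progress=lambda *args, **kwargs: None)
if result is not None:
    rate, samples = result                      # 24000, float32 numpy array
    wavfile.write("output.wav", rate, samples)  # write a 24 kHz WAV file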
139 + # Examples for the UI - Khmer text examples
140   examples = [
141 +     ["ααααΆααα½α αααα»αααααα Kiri α αΎααααα»αααΊααΆαααΌαααααα·αααααααα·ααΆαα"],
142       ["αααα»αα’αΆα αααααΎαααααααα·ααΆααααααα ααΌα ααΆ <laugh> ααΎα α¬ <sigh> ααααααα αΎαα"],
143 +     ["αααα»αααααα αααα»αααΈαααα»αααααααα α αΎαααΆαα ααΆα ααα <gasp> α αααΎαααΆααα"],
144 +     ["ααααααα ααααααα»ααα·ααΆαα αααΎαααα αααα»αααααΌα <cough> αα»ααααα"],
145 +     ["ααΆααα·ααΆααα α αααααα»αααΆααΆααα α’αΆα ααΆαααΆααα·ααΆαα <groan> ααα»ααααααΎα αΆαα ααα’αΆα ααααΎααΆαα"],
146   ]
147 + # Available voices (commented out for simpler UI)
148 + # VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe", "jing", "Elise"]
149 + # Available Emotive Tags
150   EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
151 + # Create Gradio interface
152   with gr.Blocks(title="Khmer Text-to-Speech") as demo:
153       gr.Markdown(f"""
154       # 🎵 Khmer Text-to-Speech
155       **αααΌαααααααααα’αααααααΆααααα**
156
157       αααα αΌαα’ααααααααααααααα’ααα α αΎαααααΆααααΆααααααααα ααΆααααααα·ααΆαα
158 +
159       💡 **Tips**: Add emotive tags like {", ".join(EMOTIVE_TAGS)} for more expressive speech!
160 +     """)
161
162       text_input = gr.Textbox(
163 +         label="Enter Khmer text (αααα αΌαα’αααααααααα)",
164 +         placeholder="αααα αΌαα’ααααααααααααααα’ααααα ααΈααα...",
165 +         lines=4
166       )
167
168 +     # Voice selector (commented out)
169 +     # voice = gr.Dropdown(
170 +     #     choices=VOICES,
171 +     #     value="tara",
172 +     #     label="Voice (ααααα)"
173 +     # )
174
175 +     # Advanced Settings
176       with gr.Accordion("🔧 Advanced Settings", open=False):
177           with gr.Row():
178               temperature = gr.Slider(
179                   minimum=0.1, maximum=1.5, value=0.6, step=0.05,
180 +                 label="Temperature",
181 +                 info="Higher values create more expressive speech"
182               )
183               top_p = gr.Slider(
184                   minimum=0.1, maximum=1.0, value=0.95, step=0.05,
185 +                 label="Top P",
186 +                 info="Nucleus sampling threshold"
187               )
188           with gr.Row():
189               repetition_penalty = gr.Slider(
190                   minimum=1.0, maximum=2.0, value=1.1, step=0.05,
191 +                 label="Repetition Penalty",
192 +                 info="Higher values discourage repetitive patterns"
193               )
194               max_new_tokens = gr.Slider(
195 +                 minimum=100, maximum=2000, value=1200, step=100,
196 +                 label="Max Length",
197 +                 info="Maximum length of generated audio"
198               )
199
200       with gr.Row():

202           clear_btn = gr.Button("🗑️ Clear", size="lg")
203
204       audio_output = gr.Audio(
205 +         label="Generated Speech (αααααααααααααΎαα‘αΎα)",
206           type="numpy",
207           show_label=True
208       )
209
210 +     # Set up examples (NO CACHE)
211       gr.Examples(
212           examples=examples,
213           inputs=[text_input],

216           cache_examples=False,
217       )
218
219 +     # Set up event handlers
220       submit_btn.click(
221           fn=generate_speech,
222 +         inputs=[text_input, temperature, top_p, repetition_penalty, max_new_tokens],
223           outputs=audio_output
224       )
225

228           inputs=[],
229           outputs=[text_input, audio_output]
230       )
231 + # Launch the app
232   if __name__ == "__main__":
233 +     demo.queue().launch(share=False)