Spaces:

Hassan-16
/

TTS

Running

App Files Community

Hassan-16 committed on Jun 28

Commit

ee617da

verified ·

1 Parent(s): cbcfe99

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -257

app.py CHANGED Viewed

@@ -1,257 +1,15 @@
-from kokoro import KModel, KPipeline
-import gradio as gr
-import os
-import random
-import torch
-import logging
-import soundfile as sf
-# Optional: import Resemblyzer for voice cloning (install via pip install resemblyzer)
-try:
-    from resemblyzer import VoiceEncoder, preprocess_wav
-    encoder = VoiceEncoder()
-except ImportError:
-    encoder = None
-# Configuration
-VOICE_DIR = r"D:\New folder (2)\model\voices"
-OUTPUT_DIR = r"D:\New folder (2)\output_audio"
-TEXT = "Hello, this is a test of the Kokoro TTS system."
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Device setup
-CUDA_AVAILABLE = torch.cuda.is_available()
-device = "cuda" if CUDA_AVAILABLE else "cpu"
-logger.info(f"Using hardware: {device}")
-# Load models for CPU and GPU (if available)
-models = {gpu: KModel("hexgrad/Kokoro-82M").to("cuda" if gpu else "cpu").eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
-# Define pipelines for American ('a') and British ('b') English
-pipelines = {
-    'a': KPipeline(model=models[False], lang_code='a', device='cpu'),  # American English
-    'b': KPipeline(model=models[False], lang_code='b', device='cpu')   # British English
-}
-# Set custom pronunciations for "kokoro" in both American and British modes
-try:
-    pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
-    pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
-except AttributeError as e:
-    logger.warning(f"Could not set custom pronunciations: {e}")
-def forward_gpu(text, voice_path, speed):
-    # Use the GPU model directly without spaces.GPU decorator
-    pipeline = pipelines[voice_path[0]]
-    # Ensure the pipeline uses the GPU model
-    pipeline.model = models[True]  # Switch to GPU model
-    generator = pipeline(text, voice=voice_path, speed=speed)
-    for _, _, audio in generator:
-        return audio
-    return None
-def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE, clone_voice_file=None):
-    voice_path = os.path.join(VOICE_DIR, voice)
-    if not os.path.exists(voice_path):
-        raise FileNotFoundError(f"Voice file not found: {voice_path}")
-    pipeline = pipelines[voice[0]]
-    # If a clone file is provided and the encoder is available, try to clone the voice
-    if clone_voice_file is not None and encoder is not None:
-        try:
-            # clone_voice_file is a file path (string) in Gradio with type="filepath"
-            wav = preprocess_wav(clone_voice_file)
-            cloned_voice = torch.tensor(encoder.embed_utterance(wav), device=device).unsqueeze(0)
-            temp_voice_path = os.path.join(VOICE_DIR, "cloned_voice.pt")
-            torch.save(cloned_voice, temp_voice_path)
-            voice_path = temp_voice_path
-        except Exception as e:
-            logger.error(f"Error cloning voice: {e}")
-            voice_path = os.path.join(VOICE_DIR, voice)
-    use_gpu = use_gpu and CUDA_AVAILABLE
-    try:
-        if use_gpu:
-            audio = forward_gpu(text, voice_path, speed)
-        else:
-            pipeline.model = models[False]  # Ensure CPU model is used
-            generator = pipeline(text, voice=voice_path, speed=speed)
-            for _, ps, audio in generator:
-                return (24000, audio.numpy()), ps
-    except gr.exceptions.Error as e:
-        if use_gpu:
-            gr.Warning(str(e))
-            gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
-            pipeline.model = models[False]  # Switch to CPU model
-            generator = pipeline(text, voice=voice_path, speed=speed)
-            for _, ps, audio in generator:
-                return (24000, audio.numpy()), ps
-        else:
-            raise gr.Error(e)
-    return None, ""
-def predict(text, voice="af_bella.pt", speed=1):
-    return generate_first(text, voice, speed, use_gpu=False)[0]
-def tokenize_first(text, voice="af_bella.pt"):
-    voice_path = os.path.join(VOICE_DIR, voice)
-    if not os.path.exists(voice_path):
-        raise FileNotFoundError(f"Voice file not found: {voice_path}")
-    pipeline = pipelines[voice[0]]
-    generator = pipeline(text, voice=voice_path)
-    for _, ps, _ in generator:
-        return ps
-    return ""
-def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
-    voice_path = os.path.join(VOICE_DIR, voice)
-    if not os.path.exists(voice_path):
-        raise FileNotFoundError(f"Voice file not found: {voice_path}")
-    pipeline = pipelines[voice[0]]
-    use_gpu = use_gpu and CUDA_AVAILABLE
-    first = True
-    if use_gpu:
-        pipeline.model = models[True]  # Switch to GPU model
-    else:
-        pipeline.model = models[False]  # Switch to CPU model
-    generator = pipeline(text, voice=voice_path, speed=speed)
-    for _, _, audio in generator:
-        yield 24000, audio.numpy()
-        if first:
-            first = False
-            yield 24000, torch.zeros(1).numpy()
-# Load random quotes and sample texts
-try:
-    with open("en.txt", "r") as r:
-        random_quotes = [line.strip() for line in r]
-except FileNotFoundError:
-    random_quotes = ["Hello, this is a test of the Kokoro TTS system."]
-def get_random_quote():
-    return random.choice(random_quotes)
-def get_gatsby():
-    try:
-        with open("gatsby5k.md", "r") as r:
-            return r.read().strip()
-    except FileNotFoundError:
-        return "The Great Gatsby text not found."
-def get_frankenstein():
-    try:
-        with open("frankenstein5k.md", "r") as r:
-            return r.read().strip()
-    except FileNotFoundError:
-        return "Frankenstein text not found."
-# Dynamically load all .pt voice files from VOICE_DIR
-def load_voice_choices():
-    voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
-    choices = {}
-    for voice_file in voice_files:
-        # Determine the voice type based on the prefix
-        prefix = voice_file[:2]
-        if prefix == 'af':
-            label = f"🇺🇸 🚺 {voice_file[3:-3].capitalize()}"
-        elif prefix == 'am':
-            label = f"🇺🇸 🚹 {voice_file[3:-3].capitalize()}"
-        elif prefix == 'bf':
-            label = f"🇬🇧 🚺 {voice_file[3:-3].capitalize()}"
-        elif prefix == 'bm':
-            label = f"🇬🇧 🚹 {voice_file[3:-3].capitalize()}"
-        else:
-            label = f"Unknown {voice_file[:-3]}"
-        choices[label] = voice_file
-    return choices
-CHOICES = load_voice_choices()
-# Log available voices
-for label, voice_path in CHOICES.items():
-    full_path = os.path.join(VOICE_DIR, voice_path)
-    if not os.path.exists(full_path):
-        logger.warning(f"Voice file not found: {full_path}")
-    else:
-        logger.info(f"Loaded voice: {label} ({voice_path})")
-# If no voices are found, add a default fallback
-if not CHOICES:
-    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
-    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
-TOKEN_NOTE = '''
-💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
-💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ
-⬇️ Lower stress [1 level](-1) or [2 levels](-2)
-⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
-'''
-with gr.Blocks() as generate_tab:
-    out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
-    generate_btn = gr.Button("Generate", variant="primary")
-    with gr.Accordion("Output Tokens", open=True):
-        out_ps = gr.Textbox(interactive=False, show_label=False,
-                            info="Tokens used to generate the audio, up to 510 context length.")
-        tokenize_btn = gr.Button("Tokenize", variant="secondary")
-        gr.Markdown(TOKEN_NOTE)
-        predict_btn = gr.Button("Predict", variant="secondary", visible=False)
-with gr.Blocks() as stream_tab:
-    out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
-    with gr.Row():
-        stream_btn = gr.Button("Stream", variant="primary")
-        stop_btn = gr.Button("Stop", variant="stop")
-    with gr.Accordion("Note", open=True):
-        gr.Markdown("⚠️ There is an unknown Gradio bug that might yield no audio the first time you click Stream.")
-        gr.DuplicateButton()
-API_OPEN = True
-with gr.Blocks() as app:
-    with gr.Row():
-        with gr.Column():
-            text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
-            with gr.Row():
-                voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
-                                    info="Quality and availability vary by language")
-                use_gpu = gr.Dropdown(
-                    [("GPU 🚀", True), ("CPU 🐌", False)],
-                    value=CUDA_AVAILABLE,
-                    label="Hardware",
-                    info="GPU is usually faster, but may require CUDA support",
-                    interactive=CUDA_AVAILABLE
-                )
-            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
-            clone_voice_file = gr.File(label="Clone Voice Sample (Optional)", file_count="single", type="filepath")
-            random_btn = gr.Button("🎲 Random Quote 💬", variant="secondary")
-            with gr.Row():
-                gatsby_btn = gr.Button("🥂 Gatsby 📕", variant="secondary")
-                frankenstein_btn = gr.Button("💀 Frankenstein 📗", variant="secondary")
-        with gr.Column():
-            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])
-    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text])
-    gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text])
-    frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text])
-    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu, clone_voice_file],
-                       outputs=[out_audio, out_ps])
-    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
-    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
-    stop_btn.click(fn=None, cancels=[stream_event])
-    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])
-if __name__ == "__main__":
-    app.queue(api_open=API_OPEN).launch(
-        server_name="127.0.0.1",
-        server_port=40001,
-        show_api=API_OPEN,
-        inbrowser=True
-    )

+import os
+import zipfile
+# Path to the zip file and extraction directory
+zip_path = "model/kokoro-v1_0.zip"
+extract_dir = "model/"
+# Check if the zip file exists and extract it
+if os.path.exists(zip_path):
+    print(f"Extracting {zip_path}...")
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(extract_dir)
+    print(f"Extraction completed. Files extracted to {extract_dir}")
+else:
+    print(f"File {zip_path} does not exist.")