Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Files Community

shukdevdatta123 committed 15 days ago

Commit

4cc61f6

verified ·

1 Parent(s): c120dc7

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -110

app.py CHANGED Viewed

@@ -1,129 +1,51 @@
 import gradio as gr
-from bark import SAMPLE_RATE, generate_audio, preload_models
-from bark.generation import generate_text_semantic
-from scipy.io.wavfile import write as write_wav
-import tempfile
-import librosa
 import numpy as np
-import torch
-# Save the original torch.load function
-original_load = torch.load
-# Define a custom load function to bypass weights_only=True issue
-def custom_load(*args, **kwargs):
-    kwargs['weights_only'] = False
-    return original_load(*args, **kwargs)
-# Monkey-patch torch.load
-torch.load = custom_load
-# Preload Bark models
-preload_models()
-# Restore the original torch.load
-torch.load = original_load
-def preprocess_audio_to_npz(audio_path):
-    """
-    Preprocess an audio file to create a .npz history prompt for voice cloning.
-    Parameters:
-    audio_path (str): Path to the input audio file.
-    Returns:
-    str: Path to the generated .npz file.
-    """
-    # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
-    audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
-    # Ensure audio is a float32 array (for potential future use)
-    audio = audio.astype(np.float32)
-    with torch.device("cpu"):
-        # Generate semantic tokens using generate_text_semantic
-        dummy_text = "Dummy text for history prompt generation."
-        semantic_tokens = generate_text_semantic(
-            text=dummy_text,
-            temp=0.7,
-            silent=True
-        )
-        # Ensure semantic_tokens is a 1D numpy array of int64
-        semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
-        if semantic_tokens.ndim != 1:
-            semantic_tokens = semantic_tokens.flatten()
-        # Simulate coarse tokens (typically shorter or quantized version of semantic tokens)
-        coarse_tokens = semantic_tokens[:256]  # Truncate to simulate coarse quantization
-        coarse_tokens = np.array(coarse_tokens, dtype=np.int64)
-        # Simulate fine tokens (often similar to coarse tokens in Bark)
-        fine_tokens = coarse_tokens.copy()  # Simplified assumption
-        fine_tokens = np.array(fine_tokens, dtype=np.int64)
-        # Create history prompt dictionary
-        history_prompt = {
-            "semantic_prompt": semantic_tokens,
-            "coarse_prompt": coarse_tokens,
-            "fine_prompt": fine_tokens
-        }
-        # Save to temporary .npz file
-        with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
-            np.savez(temp_file.name, **history_prompt)
-            npz_path = temp_file.name
-    return npz_path
 def generate_speech(reference_audio, text):
     """
-    Generate speech audio mimicking the voice from the reference audio using Bark.
     Parameters:
     reference_audio (str): Filepath to the uploaded voice sample.
     text (str): Text to convert to speech.
     Returns:
-    str: Path to the generated audio file.
     """
-    if not reference_audio:
-        raise ValueError("Please upload a voice sample.")
-    if not text:
-        raise ValueError("Please enter text to convert.")
-    # Preprocess audio to create .npz history prompt
-    history_prompt = preprocess_audio_to_npz(reference_audio)
-    # Generate speech using the processed history prompt
-    audio_array = generate_audio(text, history_prompt=history_prompt)
-    # Save the audio to a temporary file
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-        write_wav(temp_file.name, SAMPLE_RATE, audio_array)
-        temp_file_path = temp_file.name
     return temp_file_path
 # Build the Gradio interface
-with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
-    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
-    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
     with gr.Row():
-        audio_input = gr.Audio(
-            type="filepath",
-            label="Upload Your Voice Sample (English)",
-            interactive=True
-        )
-        text_input = gr.Textbox(
-            label="Enter Text to Convert to Speech",
-            placeholder="e.g., I love chocolate"
-        )
     generate_btn = gr.Button("Generate Speech")
     audio_output = gr.Audio(label="Generated Speech", interactive=False)
     # Connect the button to the generation function
     generate_btn.click(
         fn=generate_speech,
@@ -132,4 +54,4 @@ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
     )
 # Launch the application
-app.launch(share=True)

 import gradio as gr
+from TTS.api import TTS
 import numpy as np
+from scipy.io import wavfile
+import tempfile
+import os
+# Load the YourTTS model once at startup
+tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
+sample_rate = tts.synthesizer.output_sample_rate
 def generate_speech(reference_audio, text):
     """
     Generate speech audio mimicking the voice from the reference audio.

     Parameters:
     reference_audio (str): Filepath to the uploaded voice sample.
     text (str): Text to convert to speech.
     Returns:
     str: Path to the generated WAV file. The file is created with
         delete=False, so it is not removed by this function.
     Raises:
     ValueError: If no voice sample was provided or the text is empty/blank.
     """
     # Fail fast with a readable message instead of letting a missing upload
     # (None) or empty text reach the TTS model.
     if not reference_audio:
         raise ValueError("Please upload a voice sample.")
     if not text or not text.strip():
         raise ValueError("Please enter text to convert.")
     # Generate speech using the reference audio and text
     wav = tts.tts(text=text, speaker_wav=reference_audio, language="en")
     # Convert the synthesized samples to a float32 array for the WAV writer
     wav_np = np.asarray(wav, dtype=np.float32)
     # Reserve a unique path and close our handle before the writer reopens the
     # file by name; reopening a still-open NamedTemporaryFile by name is
     # platform-dependent.
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
         temp_file_path = temp_file.name
     try:
         wavfile.write(temp_file_path, sample_rate, wav_np)
     except Exception:
         # Do not leave an orphaned temp file behind when the write fails.
         os.remove(temp_file_path)
         raise
     return temp_file_path
 # Build the Gradio interface
+with gr.Blocks(title="Voice Cloning TTS") as app:
+    gr.Markdown("## Voice Cloning Text-to-Speech")
+    gr.Markdown("Upload a short voice sample in English, then enter text to hear it in your voice!")
     with gr.Row():
+        audio_input = gr.Audio(type="filepath", label="Upload Your Voice Sample (English)")
+        text_input = gr.Textbox(label="Enter Text to Convert to Speech", placeholder="e.g., I love chocolate")
     generate_btn = gr.Button("Generate Speech")
     audio_output = gr.Audio(label="Generated Speech", interactive=False)
     # Connect the button to the generation function
     generate_btn.click(
         fn=generate_speech,
     )
 # Launch the application
+app.launch()