Spaces:

shukdevdatta123
/

VocalForge-AI

Running

App Files Community

shukdevdatta123 committed on Jul 2

Commit

e25f277

verified ·

1 Parent(s): e47cdda

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -27

app.py CHANGED Viewed

@@ -1,39 +1,96 @@
 import gradio as gr
 from bark import SAMPLE_RATE, generate_audio, preload_models
 from scipy.io.wavfile import write as write_wav
 import tempfile
 import torch
 # Save the original torch.load function
 original_load = torch.load
-# Define a custom load function that forces weights_only=False
 def custom_load(*args, **kwargs):
     """
     Forward a load call to the saved original_load with weights_only forced to False.
     Parameters:
     *args: Positional arguments, passed through unchanged.
     **kwargs: Keyword arguments, passed through; any caller-supplied
         'weights_only' value is overwritten with False.
     Returns:
     Whatever original_load returns for those arguments.
     """
     # SECURITY: with weights_only=False, torch.load unpickles arbitrary
     # objects, so only checkpoints from a trusted source should be loaded.
     kwargs['weights_only'] = False
     return original_load(*args, **kwargs)
-# Monkey-patch torch.load with the custom function
 torch.load = custom_load
-# Preload the models with the patched torch.load
 preload_models()
-# Restore the original torch.load function
 torch.load = original_load
 def generate_speech(reference_audio, text):
     """
-    Generate speech audio using a pre-defined speaker.
     Parameters:
-    reference_audio (str): Path to uploaded audio (ignored in this version).
     text (str): Text to convert to speech.
     Returns:
     str: Path to the generated audio file.
     """
-    # Use a pre-defined speaker since custom voice cloning isn't supported
-    history_prompt = "v2/en_speaker_6"  # Pre-defined speaker ID
     audio_array = generate_audio(text, history_prompt=history_prompt)
     # Save the audio to a temporary file
@@ -44,29 +101,25 @@ def generate_speech(reference_audio, text):
     return temp_file_path
 # Build the Gradio interface
-with gr.Blocks(title="Text-to-Speech with Bark") as app:
-    gr.Markdown("## Text-to-Speech with Bark")
-    gr.Markdown(
-        "Enter text to hear it in a pre-defined voice. "
-        "Custom voice cloning from uploaded audio is not supported in this version."
-    )
-    # Input components
-    audio_input = gr.Audio(
-        type="filepath",
-        label="Upload Your Voice Sample (English, Ignored)",
-        visible=True  # Kept for future functionality, but ignored
-    )
-    text_input = gr.Textbox(
-        label="Enter Text to Convert to Speech",
-        placeholder="e.g., I love chocolate"
-    )
-    # Output component
     audio_output = gr.Audio(label="Generated Speech", interactive=False)
-    # Button to trigger generation
-    generate_btn = gr.Button("Generate Speech")
     generate_btn.click(
         fn=generate_speech,
         inputs=[audio_input, text_input],

 import gradio as gr
 from bark import SAMPLE_RATE, generate_audio, preload_models
+from bark.generation import load_model, generate_text_semantic, _tokenize
 from scipy.io.wavfile import write as write_wav
 import tempfile
 import torch
+import librosa
+import numpy as np
 # Save the original torch.load function
 original_load = torch.load
+# Define a custom load function that forces weights_only=False, so checkpoints rejected by torch.load's weights_only=True mode can still be loaded
 def custom_load(*args, **kwargs):
     """
     Forward a load call to the saved original_load with weights_only forced to False.
     Parameters:
     *args: Positional arguments, passed through unchanged.
     **kwargs: Keyword arguments, passed through; any caller-supplied
         'weights_only' value is overwritten with False.
     Returns:
     Whatever original_load returns for those arguments.
     """
     # SECURITY: with weights_only=False, torch.load unpickles arbitrary
     # objects, so only checkpoints from a trusted source should be loaded.
     kwargs['weights_only'] = False
     return original_load(*args, **kwargs)
+# Monkey-patch torch.load
 torch.load = custom_load
+# Preload Bark models
 preload_models()
+# Restore the original torch.load
 torch.load = original_load
+def preprocess_audio_to_npz(audio_path):
+    """
+    Preprocess an audio file to create a .npz history prompt for voice cloning.
+    Parameters:
+    audio_path (str): Path to the input audio file.
+    Returns:
+    str: Path to the generated .npz file.
+    """
+    # Decode the file as mono and resample it to Bark's SAMPLE_RATE (24kHz); the returned rate `sr` is not used afterwards
+    audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
+    # Cast the waveform to float32 (librosa.load's default dtype is already float32, so this is normally a no-op)
+    audio = audio.astype(np.float32)
+    # Load the two HuBERT helpers on CPU. NOTE(review): upstream bark.generation.load_model appears to accept only "text"/"coarse"/"fine" and no `device` kwarg -- confirm against the installed bark
+    hubert_manager = load_model(model_type="hubert", device="cpu")
+    hubert_tokenizer = load_model(model_type="hubert_tokenizer", device="cpu")
+    # Turn the waveform into tokens. NOTE(review): upstream bark's _tokenize looks like a text helper taking (tokenizer, text) -- confirm this three-argument call is supported
+    tokens = _tokenize(audio, hubert_manager, hubert_tokenizer)
+    semantic_tokens = tokens[0]  # keep only the first element as the semantic tokens
+    # Load the coarse model on CPU (same load_model caveat as for the HuBERT helpers)
+    coarse_model = load_model(model_type="coarse", device="cpu")
+    # Derive coarse tokens from the semantic tokens. NOTE(review): upstream generate_text_semantic takes text plus sampling options; these keyword names do not appear in its signature -- confirm
+    coarse_tokens = generate_text_semantic(
+        semantic_tokens=semantic_tokens,
+        model=coarse_model,
+        max_gen_len=512
+    )
+    # Bundle both token arrays under the key names stored in the .npz. NOTE(review): Bark's bundled speaker prompts also carry a "fine_prompt" array -- confirm generate_audio accepts a prompt without it
+    history_prompt = {
+        "semantic_prompt": semantic_tokens,
+        "coarse_prompt": coarse_tokens
+    }
+    # Reserve a unique .npz path; delete=False keeps the file after the handle closes, and nothing in this function removes it afterwards
+    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
+        np.savez(temp_file.name, **history_prompt)  # written by path, not through the open handle
+        npz_path = temp_file.name
+    return npz_path
 def generate_speech(reference_audio, text):
     """
+    Generate speech audio mimicking the voice from the reference audio using Bark.
     Parameters:
+    reference_audio (str): Filepath to the uploaded voice sample.
     text (str): Text to convert to speech.
     Returns:
     str: Path to the generated audio file.
     """
+    if not reference_audio:
+        raise ValueError("Please upload a voice sample.")
+    if not text:
+        raise ValueError("Please enter text to convert.")
+    # Preprocess audio to create .npz history prompt
+    history_prompt = preprocess_audio_to_npz(reference_audio)
+    # Generate speech using the processed history prompt
     audio_array = generate_audio(text, history_prompt=history_prompt)
     # Save the audio to a temporary file
     return temp_file_path
 # Build the Gradio interface
+with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
+    gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
+    gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
+    with gr.Row():
+        audio_input = gr.Audio(
+            type="filepath",
+            label="Upload Your Voice Sample (English)",
+            interactive=True
+        )
+        text_input = gr.Textbox(
+            label="Enter Text to Convert to Speech",
+            placeholder="e.g., I love chocolate"
+        )
+    generate_btn = gr.Button("Generate Speech")
     audio_output = gr.Audio(label="Generated Speech", interactive=False)
+    # Connect the button to the generation function
     generate_btn.click(
         fn=generate_speech,
         inputs=[audio_input, text_input],