Spaces:

shukdevdatta123
/

VocalForge-AI

Paused

App Files Files Community

shukdevdatta123 committed on Jul 2

Commit

523a466

verified ·

1 Parent(s): 4c9d3f6

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -24

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
 from bark import SAMPLE_RATE, generate_audio, preload_models
 from scipy.io.wavfile import write as write_wav
 import tempfile
 import librosa
@@ -31,7 +32,7 @@ def preprocess_audio_to_npz(audio_path):
     audio_path (str): Path to the input audio file.
     Returns:
-    str: Path to the input audio file or generated .npz file.
     """
     # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
     audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
@@ -39,29 +40,39 @@ def preprocess_audio_to_npz(audio_path):
     # Ensure audio is a float32 array
     audio = audio.astype(np.float32)
-    try:
-        # Attempt to use the audio file directly as history_prompt
-        # Bark may support raw audio files as history_prompt in some versions
-        return audio_path
-    except:
-        # Fallback: Create a minimal .npz file with dummy tokens
-        with torch.device("cpu"):
-            # Generate dummy tokens (minimal structure to avoid errors)
-            dummy_tokens = np.zeros((512,), dtype=np.int32)  # Placeholder tokens
-            # Create history prompt dictionary
-            history_prompt = {
-                "semantic_prompt": dummy_tokens,
-                "coarse_prompt": dummy_tokens,
-                "fine_prompt": dummy_tokens
-            }
-            # Save to temporary .npz file
-            with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
-                np.savez(temp_file.name, **history_prompt)
-                npz_path = temp_file.name
-        return npz_path
 def generate_speech(reference_audio, text):
     """
@@ -79,7 +90,7 @@ def generate_speech(reference_audio, text):
     if not text:
         raise ValueError("Please enter text to convert.")
-    # Preprocess audio to get history prompt (audio file or .npz)
     history_prompt = preprocess_audio_to_npz(reference_audio)
     # Generate speech using the processed history prompt
@@ -110,6 +121,7 @@ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
     generate_btn = gr.Button("Generate Speech")
     audio_output = gr.Audio(label="Generated Speech", interactive=False)
     # Connect the button to the generation function
     generate_btn.click(
         fn=generate_speech,

 import gradio as gr
 from bark import SAMPLE_RATE, generate_audio, preload_models
+from bark.generation import generate_text_semantic
 from scipy.io.wavfile import write as write_wav
 import tempfile
 import librosa
     audio_path (str): Path to the input audio file.
     Returns:
+    str: Path to the generated .npz file.
     """
     # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
     audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
     # Ensure audio is a float32 array
     audio = audio.astype(np.float32)
+    with torch.device("cpu"):
+        # Generate dummy semantic tokens using generate_text_semantic
+        dummy_text = "Dummy text for history prompt generation."
+        semantic_tokens = generate_text_semantic(
+            text=dummy_text,
+            max_gen_len=512,
+            temp=0.7,
+            silent=True
+        )
+        # Ensure semantic_tokens is a numpy array with correct shape
+        semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
+        if semantic_tokens.ndim == 0:
+            semantic_tokens = semantic_tokens.reshape(-1)
+        # Coarse and fine prompts are derived from semantic tokens
+        # Bark often uses similar tokens for coarse and fine prompts
+        coarse_tokens = semantic_tokens  # Simplified assumption
+        fine_tokens = semantic_tokens    # Simplified assumption
+        # Create history prompt dictionary
+        history_prompt = {
+            "semantic_prompt": semantic_tokens,
+            "coarse_prompt": coarse_tokens,
+            "fine_prompt": fine_tokens
+        }
+        # Save to temporary .npz file
+        with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
+            np.savez(temp_file.name, **history_prompt)
+            npz_path = temp_file.name
+    return npz_path
 def generate_speech(reference_audio, text):
     """
     if not text:
         raise ValueError("Please enter text to convert.")
+    # Preprocess audio to create .npz history prompt
     history_prompt = preprocess_audio_to_npz(reference_audio)
     # Generate speech using the processed history prompt
     generate_btn = gr.Button("Generate Speech")
     audio_output = gr.Audio(label="Generated Speech", interactive=False)
     # Connect the button to the generation function
     generate_btn.click(
         fn=generate_speech,