Spaces:

shukdevdatta123
/

VocalForge-AI

Paused

App Files Files Community

shukdevdatta123 committed on Jul 2

Commit

4ee577e

verified ·

1 Parent(s): 523a466

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -20

app.py CHANGED Viewed

@@ -27,61 +27,60 @@ torch.load = original_load
 def preprocess_audio_to_npz(audio_path):
     """
     Preprocess an audio file to create a .npz history prompt for voice cloning.

     The prompt arrays are produced from a fixed dummy sentence via
     generate_text_semantic; the waveform loaded from audio_path is not
     referenced again after it is cast to float32.

     Parameters:
     audio_path (str): Path to the input audio file.
     Returns:
     str: Path to the generated .npz file. The file is created with
     delete=False, so it is not removed automatically.
     """
     # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
     audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
     # Ensure audio is a float32 array
     # NOTE(review): `audio` and `sr` are not used below this point.
     audio = audio.astype(np.float32)
     with torch.device("cpu"):
         # Generate dummy semantic tokens using generate_text_semantic
         dummy_text = "Dummy text for history prompt generation."
         semantic_tokens = generate_text_semantic(
             text=dummy_text,
-            max_gen_len=512,
             temp=0.7,
             silent=True
         )
         # Ensure semantic_tokens is a numpy int64 array with at least one dimension
         semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
         if semantic_tokens.ndim == 0:
             semantic_tokens = semantic_tokens.reshape(-1)
         # The same array is reused for the coarse and fine prompts.
         # NOTE(review): simplified assumption - confirm this matches the
         # prompt layout that generate_audio expects.
         coarse_tokens = semantic_tokens  # Simplified assumption
         fine_tokens = semantic_tokens    # Simplified assumption
         # Create history prompt dictionary
         history_prompt = {
             "semantic_prompt": semantic_tokens,
             "coarse_prompt": coarse_tokens,
             "fine_prompt": fine_tokens
         }
         # Save to temporary .npz file (written by name while the handle is open)
         with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
             np.savez(temp_file.name, **history_prompt)
             npz_path = temp_file.name
     return npz_path
 def generate_speech(reference_audio, text):
     """
     Generate speech audio mimicking the voice from the reference audio using Bark.
     Parameters:
     reference_audio (str): Filepath to the uploaded voice sample.
     text (str): Text to convert to speech.
     Returns:
     str: Path to the generated audio file.
     """
@@ -89,25 +88,25 @@ def generate_speech(reference_audio, text):
         raise ValueError("Please upload a voice sample.")
     if not text:
         raise ValueError("Please enter text to convert.")
     # Preprocess audio to create .npz history prompt
     history_prompt = preprocess_audio_to_npz(reference_audio)
     # Generate speech using the processed history prompt
     audio_array = generate_audio(text, history_prompt=history_prompt)
     # Save the audio to a temporary file
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
         write_wav(temp_file.name, SAMPLE_RATE, audio_array)
         temp_file_path = temp_file.name
     return temp_file_path
 # Build the Gradio interface
 with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
     gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
     gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
     with gr.Row():
         audio_input = gr.Audio(
             type="filepath",
@@ -118,10 +117,10 @@ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
             label="Enter Text to Convert to Speech",
             placeholder="e.g., I love chocolate"
         )
     generate_btn = gr.Button("Generate Speech")
     audio_output = gr.Audio(label="Generated Speech", interactive=False)
     # Connect the button to the generation function
     generate_btn.click(
         fn=generate_speech,
@@ -130,4 +129,4 @@ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
     )
 # Launch the application
-app.launch()

 def preprocess_audio_to_npz(audio_path):
     """
     Preprocess an audio file to create a .npz history prompt for voice cloning.

     The reference audio is loaded so that an unreadable file fails early, but
     the prompt arrays themselves are generated from a fixed dummy sentence;
     the waveform is not used to build them.

     Parameters:
     audio_path (str): Path to the input audio file.
     Returns:
     str: Path to the generated .npz file. The file is created with
     delete=False, so it is not removed automatically.
     Raises:
     Any exception raised by librosa.load when audio_path cannot be loaded.
     """
     # Load and resample to SAMPLE_RATE as mono. The samples are discarded:
     # nothing below consumes them, the call only validates the input file.
     librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)

     with torch.device("cpu"):
         # Generate dummy semantic tokens using generate_text_semantic
         semantic_tokens = generate_text_semantic(
             text="Dummy text for history prompt generation.",
             temp=0.7,
             silent=True,
         )

     # Normalise to an int64 array with at least one dimension (a scalar
     # result becomes a length-1 vector).
     semantic_tokens = np.atleast_1d(np.array(semantic_tokens, dtype=np.int64))

     # The same array is stored under all three prompt keys.
     # NOTE(review): simplified assumption - confirm this matches the prompt
     # layout that generate_audio expects for coarse and fine prompts.
     history_prompt = {
         "semantic_prompt": semantic_tokens,
         "coarse_prompt": semantic_tokens,
         "fine_prompt": semantic_tokens,
     }

     # Reserve a unique path and close the handle BEFORE writing: reopening a
     # NamedTemporaryFile by name while it is still open fails on Windows.
     with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
         npz_path = temp_file.name
     np.savez(npz_path, **history_prompt)
     return npz_path
 def generate_speech(reference_audio, text):
     """
     Generate speech audio mimicking the voice from the reference audio using Bark.
     Parameters:
     reference_audio (str): Filepath to the uploaded voice sample.
     text (str): Text to convert to speech.
     Returns:
     str: Path to the generated audio file.
     """
         raise ValueError("Please upload a voice sample.")
     if not text:
         raise ValueError("Please enter text to convert.")
     # Preprocess audio to create .npz history prompt
     history_prompt = preprocess_audio_to_npz(reference_audio)
     # Generate speech using the processed history prompt
     audio_array = generate_audio(text, history_prompt=history_prompt)
     # Save the audio to a temporary file
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
         write_wav(temp_file.name, SAMPLE_RATE, audio_array)
         temp_file_path = temp_file.name
     return temp_file_path
 # Build the Gradio interface
 with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
     gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
     gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
     with gr.Row():
         audio_input = gr.Audio(
             type="filepath",
             label="Enter Text to Convert to Speech",
             placeholder="e.g., I love chocolate"
         )
     generate_btn = gr.Button("Generate Speech")
     audio_output = gr.Audio(label="Generated Speech", interactive=False)
     # Connect the button to the generation function
     generate_btn.click(
         fn=generate_speech,
     )
 # Launch the application
+app.launch(share=True)