shukdevdatta123 committed on
Commit
4cc61f6
·
verified ·
1 Parent(s): c120dc7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -110
app.py CHANGED
@@ -1,129 +1,51 @@
1
  import gradio as gr
2
- from bark import SAMPLE_RATE, generate_audio, preload_models
3
- from bark.generation import generate_text_semantic
4
- from scipy.io.wavfile import write as write_wav
5
- import tempfile
6
- import librosa
7
  import numpy as np
8
- import torch
9
-
10
- # Save the original torch.load function
11
- original_load = torch.load
12
-
13
- # Define a custom load function to bypass weights_only=True issue
14
- def custom_load(*args, **kwargs):
15
- kwargs['weights_only'] = False
16
- return original_load(*args, **kwargs)
17
-
18
- # Monkey-patch torch.load
19
- torch.load = custom_load
20
-
21
- # Preload Bark models
22
- preload_models()
23
-
24
- # Restore the original torch.load
25
- torch.load = original_load
26
-
27
- def preprocess_audio_to_npz(audio_path):
28
- """
29
- Preprocess an audio file to create a .npz history prompt for voice cloning.
30
-
31
- Parameters:
32
- audio_path (str): Path to the input audio file.
33
-
34
- Returns:
35
- str: Path to the generated .npz file.
36
- """
37
- # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
38
- audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
39
-
40
- # Ensure audio is a float32 array (for potential future use)
41
- audio = audio.astype(np.float32)
42
-
43
- with torch.device("cpu"):
44
- # Generate semantic tokens using generate_text_semantic
45
- dummy_text = "Dummy text for history prompt generation."
46
- semantic_tokens = generate_text_semantic(
47
- text=dummy_text,
48
- temp=0.7,
49
- silent=True
50
- )
51
-
52
- # Ensure semantic_tokens is a 1D numpy array of int64
53
- semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
54
- if semantic_tokens.ndim != 1:
55
- semantic_tokens = semantic_tokens.flatten()
56
-
57
- # Simulate coarse tokens (typically shorter or quantized version of semantic tokens)
58
- coarse_tokens = semantic_tokens[:256] # Truncate to simulate coarse quantization
59
- coarse_tokens = np.array(coarse_tokens, dtype=np.int64)
60
-
61
- # Simulate fine tokens (often similar to coarse tokens in Bark)
62
- fine_tokens = coarse_tokens.copy() # Simplified assumption
63
- fine_tokens = np.array(fine_tokens, dtype=np.int64)
64
-
65
- # Create history prompt dictionary
66
- history_prompt = {
67
- "semantic_prompt": semantic_tokens,
68
- "coarse_prompt": coarse_tokens,
69
- "fine_prompt": fine_tokens
70
- }
71
-
72
- # Save to temporary .npz file
73
- with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
74
- np.savez(temp_file.name, **history_prompt)
75
- npz_path = temp_file.name
76
 
77
- return npz_path
 
 
78
 
79
  def generate_speech(reference_audio, text):
80
  """
81
- Generate speech audio mimicking the voice from the reference audio using Bark.
82
-
83
  Parameters:
84
  reference_audio (str): Filepath to the uploaded voice sample.
85
  text (str): Text to convert to speech.
86
-
87
  Returns:
88
- str: Path to the generated audio file.
89
  """
90
- if not reference_audio:
91
- raise ValueError("Please upload a voice sample.")
92
- if not text:
93
- raise ValueError("Please enter text to convert.")
94
-
95
- # Preprocess audio to create .npz history prompt
96
- history_prompt = preprocess_audio_to_npz(reference_audio)
97
-
98
- # Generate speech using the processed history prompt
99
- audio_array = generate_audio(text, history_prompt=history_prompt)
100
-
101
- # Save the audio to a temporary file
102
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
103
- write_wav(temp_file.name, SAMPLE_RATE, audio_array)
104
- temp_file_path = temp_file.name
105
-
106
  return temp_file_path
107
 
108
  # Build the Gradio interface
109
- with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
110
- gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
111
- gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
112
-
113
  with gr.Row():
114
- audio_input = gr.Audio(
115
- type="filepath",
116
- label="Upload Your Voice Sample (English)",
117
- interactive=True
118
- )
119
- text_input = gr.Textbox(
120
- label="Enter Text to Convert to Speech",
121
- placeholder="e.g., I love chocolate"
122
- )
123
-
124
  generate_btn = gr.Button("Generate Speech")
125
  audio_output = gr.Audio(label="Generated Speech", interactive=False)
126
-
127
  # Connect the button to the generation function
128
  generate_btn.click(
129
  fn=generate_speech,
@@ -132,4 +54,4 @@ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
132
  )
133
 
134
  # Launch the application
135
- app.launch(share=True)
 
1
  import gradio as gr
2
+ from TTS.api import TTS
 
 
 
 
3
  import numpy as np
4
+ from scipy.io import wavfile
5
+ import tempfile
6
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Load the YourTTS model once at startup
9
+ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
10
+ sample_rate = tts.synthesizer.output_sample_rate
11
 
12
  def generate_speech(reference_audio, text):
13
  """
14
+ Generate speech audio mimicking the voice from the reference audio.
15
+
16
  Parameters:
17
  reference_audio (str): Filepath to the uploaded voice sample.
18
  text (str): Text to convert to speech.
19
+
20
  Returns:
21
+ str: Path to the generated audio file
22
  """
23
+ # Generate speech using the reference audio and text
24
+ wav = tts.tts(text=text, speaker_wav=reference_audio, language="en")
25
+ # Convert list to numpy array
26
+ wav_np = np.array(wav, dtype=np.float32)
27
+
28
+ # Create a temporary file to save the audio
29
+ temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
30
+ temp_file_path = temp_file.name
31
+ # Save the audio to the temporary file
32
+ wavfile.write(temp_file_path, sample_rate, wav_np)
33
+ temp_file.close()
34
+
 
 
 
 
35
  return temp_file_path
36
 
37
  # Build the Gradio interface
38
+ with gr.Blocks(title="Voice Cloning TTS") as app:
39
+ gr.Markdown("## Voice Cloning Text-to-Speech")
40
+ gr.Markdown("Upload a short voice sample in English, then enter text to hear it in your voice!")
41
+
42
  with gr.Row():
43
+ audio_input = gr.Audio(type="filepath", label="Upload Your Voice Sample (English)")
44
+ text_input = gr.Textbox(label="Enter Text to Convert to Speech", placeholder="e.g., I love chocolate")
45
+
 
 
 
 
 
 
 
46
  generate_btn = gr.Button("Generate Speech")
47
  audio_output = gr.Audio(label="Generated Speech", interactive=False)
48
+
49
  # Connect the button to the generation function
50
  generate_btn.click(
51
  fn=generate_speech,
 
54
  )
55
 
56
  # Launch the application
57
+ app.launch()