shukdevdatta123 committed on
Commit
4ee577e
·
verified ·
1 Parent(s): 523a466

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -20
app.py CHANGED
@@ -27,61 +27,60 @@ torch.load = original_load
27
  def preprocess_audio_to_npz(audio_path):
28
  """
29
  Preprocess an audio file to create a .npz history prompt for voice cloning.
30
-
31
  Parameters:
32
  audio_path (str): Path to the input audio file.
33
-
34
  Returns:
35
  str: Path to the generated .npz file.
36
  """
37
  # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
38
  audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
39
-
40
  # Ensure audio is a float32 array
41
  audio = audio.astype(np.float32)
42
-
43
  with torch.device("cpu"):
44
  # Generate dummy semantic tokens using generate_text_semantic
45
  dummy_text = "Dummy text for history prompt generation."
46
  semantic_tokens = generate_text_semantic(
47
  text=dummy_text,
48
- max_gen_len=512,
49
  temp=0.7,
50
  silent=True
51
  )
52
-
53
  # Ensure semantic_tokens is a numpy array with correct shape
54
  semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
55
  if semantic_tokens.ndim == 0:
56
  semantic_tokens = semantic_tokens.reshape(-1)
57
-
58
  # Coarse and fine prompts are derived from semantic tokens
59
  # Bark often uses similar tokens for coarse and fine prompts
60
  coarse_tokens = semantic_tokens # Simplified assumption
61
  fine_tokens = semantic_tokens # Simplified assumption
62
-
63
  # Create history prompt dictionary
64
  history_prompt = {
65
  "semantic_prompt": semantic_tokens,
66
  "coarse_prompt": coarse_tokens,
67
  "fine_prompt": fine_tokens
68
  }
69
-
70
  # Save to temporary .npz file
71
  with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
72
  np.savez(temp_file.name, **history_prompt)
73
  npz_path = temp_file.name
74
-
75
  return npz_path
76
 
77
  def generate_speech(reference_audio, text):
78
  """
79
  Generate speech audio mimicking the voice from the reference audio using Bark.
80
-
81
  Parameters:
82
  reference_audio (str): Filepath to the uploaded voice sample.
83
  text (str): Text to convert to speech.
84
-
85
  Returns:
86
  str: Path to the generated audio file.
87
  """
@@ -89,25 +88,25 @@ def generate_speech(reference_audio, text):
89
  raise ValueError("Please upload a voice sample.")
90
  if not text:
91
  raise ValueError("Please enter text to convert.")
92
-
93
  # Preprocess audio to create .npz history prompt
94
  history_prompt = preprocess_audio_to_npz(reference_audio)
95
-
96
  # Generate speech using the processed history prompt
97
  audio_array = generate_audio(text, history_prompt=history_prompt)
98
-
99
  # Save the audio to a temporary file
100
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
101
  write_wav(temp_file.name, SAMPLE_RATE, audio_array)
102
  temp_file_path = temp_file.name
103
-
104
  return temp_file_path
105
 
106
  # Build the Gradio interface
107
  with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
108
  gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
109
  gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
110
-
111
  with gr.Row():
112
  audio_input = gr.Audio(
113
  type="filepath",
@@ -118,10 +117,10 @@ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
118
  label="Enter Text to Convert to Speech",
119
  placeholder="e.g., I love chocolate"
120
  )
121
-
122
  generate_btn = gr.Button("Generate Speech")
123
  audio_output = gr.Audio(label="Generated Speech", interactive=False)
124
-
125
  # Connect the button to the generation function
126
  generate_btn.click(
127
  fn=generate_speech,
@@ -130,4 +129,4 @@ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
130
  )
131
 
132
  # Launch the application
133
- app.launch()
 
def preprocess_audio_to_npz(audio_path):
    """
    Build a Bark history prompt (.npz) to accompany a reference voice sample.

    The reference audio is loaded and resampled, which confirms that it can
    be decoded, but its waveform is NOT used to derive the prompt tokens:
    the semantic tokens come from a fixed dummy sentence, and the coarse and
    fine codes are generated from those semantic tokens. The saved prompt
    therefore has the array layout Bark checks for a history prompt (1-D
    semantic tokens, 2-D coarse and fine codebooks), but it does not encode
    the speaker heard in ``audio_path``.

    Parameters:
        audio_path (str): Path to the input audio file.

    Returns:
        str: Path to the generated .npz file. The file is created with
            ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If the audio file decodes to zero samples.
    """
    # Function-scope import: these helpers live beside generate_text_semantic
    # in Bark, and importing them here keeps the block self-contained.
    from bark.generation import generate_coarse, generate_fine

    # Load and resample audio to Bark's SAMPLE_RATE (24kHz). The returned
    # sample rate is discarded because it always equals the requested one.
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    audio = audio.astype(np.float32)
    if audio.size == 0:
        raise ValueError("The voice sample contains no audio.")
    # NOTE(review): `audio` is only validated, never encoded. Real voice
    # cloning would need tokens derived from this waveform.

    with torch.device("cpu"):
        # Generate dummy semantic tokens using generate_text_semantic
        dummy_text = "Dummy text for history prompt generation."
        semantic_tokens = generate_text_semantic(
            text=dummy_text,
            temp=0.7,
            silent=True,
        )

        # Guarantee a 1-D int64 array even if a scalar comes back
        semantic_tokens = np.atleast_1d(
            np.asarray(semantic_tokens, dtype=np.int64)
        )

        # Coarse and fine prompts must be codebook arrays, not copies of the
        # semantic tokens, so run Bark's own coarse and fine stages to get
        # correctly shaped, in-range codes.
        coarse_tokens = generate_coarse(semantic_tokens, temp=0.7, silent=True)
        fine_tokens = generate_fine(coarse_tokens, temp=0.5, silent=True)

    # Create history prompt dictionary
    history_prompt = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens,
    }

    # Save to temporary .npz file. Write through the open handle instead of
    # reopening the file by name, which is not possible on Windows while the
    # NamedTemporaryFile is still open.
    with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
        np.savez(temp_file, **history_prompt)
        npz_path = temp_file.name

    return npz_path
75
 
76
  def generate_speech(reference_audio, text):
77
  """
78
  Generate speech audio mimicking the voice from the reference audio using Bark.
79
+
80
  Parameters:
81
  reference_audio (str): Filepath to the uploaded voice sample.
82
  text (str): Text to convert to speech.
83
+
84
  Returns:
85
  str: Path to the generated audio file.
86
  """
 
88
  raise ValueError("Please upload a voice sample.")
89
  if not text:
90
  raise ValueError("Please enter text to convert.")
91
+
92
  # Preprocess audio to create .npz history prompt
93
  history_prompt = preprocess_audio_to_npz(reference_audio)
94
+
95
  # Generate speech using the processed history prompt
96
  audio_array = generate_audio(text, history_prompt=history_prompt)
97
+
98
  # Save the audio to a temporary file
99
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
100
  write_wav(temp_file.name, SAMPLE_RATE, audio_array)
101
  temp_file_path = temp_file.name
102
+
103
  return temp_file_path
104
 
105
  # Build the Gradio interface
106
  with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
107
  gr.Markdown("## Voice Cloning Text-to-Speech with Bark")
108
  gr.Markdown("Upload a short voice sample in English (5-10 seconds recommended), then enter text to hear it in your voice!")
109
+
110
  with gr.Row():
111
  audio_input = gr.Audio(
112
  type="filepath",
 
117
  label="Enter Text to Convert to Speech",
118
  placeholder="e.g., I love chocolate"
119
  )
120
+
121
  generate_btn = gr.Button("Generate Speech")
122
  audio_output = gr.Audio(label="Generated Speech", interactive=False)
123
+
124
  # Connect the button to the generation function
125
  generate_btn.click(
126
  fn=generate_speech,
 
129
  )
130
 
131
  # Launch the application
132
+ app.launch(share=True)