shukdevdatta123 committed on
Commit
523a466
·
verified ·
1 Parent(s): 4c9d3f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -24
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  from bark import SAMPLE_RATE, generate_audio, preload_models
 
3
  from scipy.io.wavfile import write as write_wav
4
  import tempfile
5
  import librosa
@@ -31,7 +32,7 @@ def preprocess_audio_to_npz(audio_path):
31
  audio_path (str): Path to the input audio file.
32
 
33
  Returns:
34
- str: Path to the input audio file or generated .npz file.
35
  """
36
  # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
37
  audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
@@ -39,29 +40,39 @@ def preprocess_audio_to_npz(audio_path):
39
  # Ensure audio is a float32 array
40
  audio = audio.astype(np.float32)
41
 
42
- try:
43
- # Attempt to use the audio file directly as history_prompt
44
- # Bark may support raw audio files as history_prompt in some versions
45
- return audio_path
46
- except:
47
- # Fallback: Create a minimal .npz file with dummy tokens
48
- with torch.device("cpu"):
49
- # Generate dummy tokens (minimal structure to avoid errors)
50
- dummy_tokens = np.zeros((512,), dtype=np.int32) # Placeholder tokens
51
-
52
- # Create history prompt dictionary
53
- history_prompt = {
54
- "semantic_prompt": dummy_tokens,
55
- "coarse_prompt": dummy_tokens,
56
- "fine_prompt": dummy_tokens
57
- }
58
-
59
- # Save to temporary .npz file
60
- with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
61
- np.savez(temp_file.name, **history_prompt)
62
- npz_path = temp_file.name
63
 
64
- return npz_path
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  def generate_speech(reference_audio, text):
67
  """
@@ -79,7 +90,7 @@ def generate_speech(reference_audio, text):
79
  if not text:
80
  raise ValueError("Please enter text to convert.")
81
 
82
- # Preprocess audio to get history prompt (audio file or .npz)
83
  history_prompt = preprocess_audio_to_npz(reference_audio)
84
 
85
  # Generate speech using the processed history prompt
@@ -110,6 +121,7 @@ with gr.Blocks(title="Voice Cloning TTS with Bark") as app:
110
 
111
  generate_btn = gr.Button("Generate Speech")
112
  audio_output = gr.Audio(label="Generated Speech", interactive=False)
 
113
  # Connect the button to the generation function
114
  generate_btn.click(
115
  fn=generate_speech,
 
1
  import gradio as gr
2
  from bark import SAMPLE_RATE, generate_audio, preload_models
3
+ from bark.generation import generate_text_semantic
4
  from scipy.io.wavfile import write as write_wav
5
  import tempfile
6
  import librosa
 
32
  audio_path (str): Path to the input audio file.
33
 
34
  Returns:
35
+ str: Path to the generated .npz file.
36
  """
37
  # Load and resample audio to Bark's SAMPLE_RATE (24kHz)
38
  audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
 
40
  # Ensure audio is a float32 array
41
  audio = audio.astype(np.float32)
42
 
43
+ with torch.device("cpu"):
44
+ # Generate dummy semantic tokens using generate_text_semantic
45
+ dummy_text = "Dummy text for history prompt generation."
46
+ semantic_tokens = generate_text_semantic(
47
+ text=dummy_text,
48
+ max_gen_len=512,
49
+ temp=0.7,
50
+ silent=True
51
+ )
52
+
53
+ # Ensure semantic_tokens is a numpy array with correct shape
54
+ semantic_tokens = np.array(semantic_tokens, dtype=np.int64)
55
+ if semantic_tokens.ndim == 0:
56
+ semantic_tokens = semantic_tokens.reshape(-1)
57
+
58
+ # Coarse and fine prompts are derived from semantic tokens
59
+ # NOTE: real Bark coarse/fine prompts are 2-D EnCodec codebook arrays, not semantic tokens; reusing them here is only a placeholder
60
+ coarse_tokens = semantic_tokens # Simplified assumption
61
+ fine_tokens = semantic_tokens # Simplified assumption
 
 
62
 
63
+ # Create history prompt dictionary
64
+ history_prompt = {
65
+ "semantic_prompt": semantic_tokens,
66
+ "coarse_prompt": coarse_tokens,
67
+ "fine_prompt": fine_tokens
68
+ }
69
+
70
+ # Save to temporary .npz file
71
+ with tempfile.NamedTemporaryFile(suffix=".npz", delete=False) as temp_file:
72
+ np.savez(temp_file.name, **history_prompt)
73
+ npz_path = temp_file.name
74
+
75
+ return npz_path
76
 
77
  def generate_speech(reference_audio, text):
78
  """
 
90
  if not text:
91
  raise ValueError("Please enter text to convert.")
92
 
93
+ # Preprocess audio to create .npz history prompt
94
  history_prompt = preprocess_audio_to_npz(reference_audio)
95
 
96
  # Generate speech using the processed history prompt
 
121
 
122
  generate_btn = gr.Button("Generate Speech")
123
  audio_output = gr.Audio(label="Generated Speech", interactive=False)
124
+
125
  # Connect the button to the generation function
126
  generate_btn.click(
127
  fn=generate_speech,