Spaces:

SohomToom
/

DocToAudioConverted

Sleeping

App Files Files Community

SohomToom committed on May 6

Commit

f2fdc48

verified ·

1 Parent(s): 1900d87

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -39

app.py CHANGED Viewed

@@ -14,6 +14,11 @@ from pydub import AudioSegment
 final_audio = AudioSegment.empty()
 # Voice model
 VOICE_MODEL = "tts_models/en/vctk/vits"
@@ -111,6 +116,19 @@ SPEAKER_METADATA = {
 	    273: { "age": 18, "gender": "F", "accent": "English"}
 	}
 def clean_text(text):
     # Remove hyperlinks
     return re.sub(r'http[s]?://\S+', '', text)
@@ -126,40 +144,58 @@ def list_speaker_choices():
 def get_speaker_id_from_label(label):
     return label.split('|')[0].strip()
-def generate_sample_audio(sample_text, speaker_label):
     if len(sample_text) > 500:
         raise gr.Error("Sample text exceeds 500 characters.")
     speaker_id = get_speaker_id_from_label(speaker_label)
-    model = TTS("tts_models/en/vctk/vits")
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
-        model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
-        return tmp_wav.name
-def generate_audio(docx_file, speaker_label):
     speaker_id = get_speaker_id_from_label(speaker_label)
-    if engine_choice == "Bark":
-        from bark import generate_audio
-        from bark.generation import preload_models
-        preload_models()
-        audio_array = generate_audio(sample_text)
-        tmp_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
-        AudioSegment(audio_array.tobytes(), frame_rate=24000, sample_width=2, channels=1).export(tmp_path, format="wav")
-        return tmp_path
-    else:
-        model = TTS("tts_models/en/vctk/vits")
         paragraphs = extract_paragraphs_from_docx(docx_file)
         combined_audio = AudioSegment.empty()
         temp_files = []
         try:
             for idx, para in enumerate(paragraphs):
-              tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-              model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
-              audio_chunk = AudioSegment.from_wav(tmp.name)
-              combined_audio += audio_chunk
-              temp_files.append(tmp.name)
-              tmp.close()
         except Exception as e:
             print("Generation interrupted. Saving partial output.", e)
@@ -167,14 +203,68 @@ def generate_audio(docx_file, speaker_label):
         output_dir = tempfile.mkdtemp()
         final_output_path = os.path.join(output_dir, "final_output.wav")
         combined_audio.export(final_output_path, format="wav")
         zip_path = os.path.join(output_dir, "output.zip")
         with zipfile.ZipFile(zip_path, 'w') as zipf:
-        zipf.write(final_output_path, arcname="final_output.wav")
         for f in temp_files:
-          os.remove(f)
-    return zip_path
 # --- UI ---
 speaker_choices = list_speaker_choices()
@@ -183,24 +273,17 @@ with gr.Blocks() as demo:
     gr.Markdown("## 📄 TTS Voice Generator with Paragraph-Wise Processing")
     with gr.Row():
         speaker_dropdown = gr.Dropdown(label="Select Voice", choices=speaker_choices)
     with gr.Row():
         sample_textbox = gr.Textbox(label="Enter Sample Text (Max 500 characters)", max_lines=5)
         sample_button = gr.Button("Generate Sample")
         clear_button = gr.Button("Clear Sample")
-    tts_engine_dropdown = gr.Dropdown(label="TTS Engine", choices=["Coqui (XTTS)", "Bark"], value="Coqui (XTTS)")
     sample_audio = gr.Audio(label="Sample Output", type="filepath")
-    sample_button.click(
-    fn=generate_sample_audio,
-    inputs=[sample_textbox, speaker_dropdown, tts_engine_dropdown],
-    outputs=[sample_audio]
-)
     clear_button.click(fn=lambda: None, inputs=[], outputs=[sample_audio])
     with gr.Row():
@@ -208,11 +291,7 @@ with gr.Blocks() as demo:
         generate_button = gr.Button("Generate Full Audio")
         download_output = gr.File(label="Download Output Zip")
-    generate_button.click(
-    fn=generate_audio,
-    inputs=[docx_input, speaker_dropdown, tts_engine_dropdown],
-    outputs=[download_output]
-)
 if __name__ == "__main__":
     demo.launch()

 final_audio = AudioSegment.empty()
+from pydub import AudioSegment
+from bark import generate_audio  # Importing Bark
 # Voice model
 VOICE_MODEL = "tts_models/en/vctk/vits"
 	    273: { "age": 18, "gender": "F", "accent": "English"}
 	}
+# Voice model
+VOICE_MODEL = "tts_models/en/vctk/vits"
+# Embedded metadata (from your file)
+SPEAKER_METADATA = {
+    300: { "age": 23, "gender": "F", "accent": "American"},
+    271: { "age": 19, "gender": "M", "accent": "Scottish"},
+    # More entries as before
+}
 def clean_text(text):
     """Return ``text`` with every http:// or https:// URL stripped out."""
     # A URL is the scheme followed by everything up to the next whitespace.
     hyperlink = re.compile(r'http[s]?://\S+')
     return hyperlink.sub('', text)
 def get_speaker_id_from_label(label):
     """Return the text before the first ``|`` in ``label``, whitespace-trimmed."""
     speaker_part, _, _ = label.partition('|')
     return speaker_part.strip()
+# Bark Voice List (Textual Prompts)
+bark_voice_choices = [
+    "young female voice",
+    "middle-aged male voice with British accent",
+    "calm narrator",
+    "excited teenager",
+    "elderly male voice",
+    "child with American accent"
+]
+# Function to generate audio using Coqui TTS (with metadata)
+def generate_sample_audio(sample_text, speaker_label, model_choice):
     if len(sample_text) > 500:
         raise gr.Error("Sample text exceeds 500 characters.")
     speaker_id = get_speaker_id_from_label(speaker_label)
+    if model_choice == "Coqui":
+        model = TTS("tts_models/multilingual/multi-dataset/your_model")
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
+            model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
+            return tmp_wav.name
+    elif model_choice == "Bark":
+        voice_prompt = speaker_label  # Bark's speaker prompt could be a descriptive voice label
+        audio = generate_audio(sample_text, speaker_prompt=voice_prompt)  # Bark's method for audio generation
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
+            audio.export(tmp_wav.name, format="wav")
+            return tmp_wav.name
+    else:
+        model = TTS("tts_models/en/vctk/vits")
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
+            model.tts_to_file(text=sample_text, speaker="p"+speaker_id, file_path=tmp_wav.name)
+            return tmp_wav.name
+# Function to generate full audio from DOCX using selected TTS model
+def generate_audio(docx_file, speaker_label, model_choice):
     speaker_id = get_speaker_id_from_label(speaker_label)
+    if model_choice == "Coqui":
+        model = TTS("tts_models/multilingual/multi-dataset/your_model")
         paragraphs = extract_paragraphs_from_docx(docx_file)
         combined_audio = AudioSegment.empty()
         temp_files = []
         try:
             for idx, para in enumerate(paragraphs):
+                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+                model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
+                audio_chunk = AudioSegment.from_wav(tmp.name)
+                combined_audio += audio_chunk
+                temp_files.append(tmp.name)
+                tmp.close()
         except Exception as e:
             print("Generation interrupted. Saving partial output.", e)
         output_dir = tempfile.mkdtemp()
         final_output_path = os.path.join(output_dir, "final_output.wav")
         combined_audio.export(final_output_path, format="wav")
         zip_path = os.path.join(output_dir, "output.zip")
         with zipfile.ZipFile(zip_path, 'w') as zipf:
+            zipf.write(final_output_path, arcname="final_output.wav")
         for f in temp_files:
+            os.remove(f)
+        return zip_path
+    elif model_choice == "Bark":
+        paragraphs = extract_paragraphs_from_docx(docx_file)
+        combined_audio = AudioSegment.empty()
+        try:
+            for para in paragraphs:
+                audio = generate_audio(para, speaker_prompt=speaker_label)  # Bark
+                combined_audio += audio  # Append audio to final output
+        except Exception as e:
+            print("Generation interrupted. Saving partial output.", e)
+        output_dir = tempfile.mkdtemp()
+        final_output_path = os.path.join(output_dir, "final_output.wav")
+        combined_audio.export(final_output_path, format="wav")
+        zip_path = os.path.join(output_dir, "output.zip")
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            zipf.write(final_output_path, arcname="final_output.wav")
+        return zip_path
+    else:  # VCTK
+        paragraphs = extract_paragraphs_from_docx(docx_file)
+        combined_audio = AudioSegment.empty()
+        temp_files = []
+        try:
+            for idx, para in enumerate(paragraphs):
+                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+                model = TTS("tts_models/en/vctk/vits")
+                model.tts_to_file(text=para, speaker="p"+speaker_id, file_path=tmp.name)
+                audio_chunk = AudioSegment.from_wav(tmp.name)
+                combined_audio += audio_chunk
+                temp_files.append(tmp.name)
+                tmp.close()
+        except Exception as e:
+            print("Generation interrupted. Saving partial output.", e)
+        output_dir = tempfile.mkdtemp()
+        final_output_path = os.path.join(output_dir, "final_output.wav")
+        combined_audio.export(final_output_path, format="wav")
+        zip_path = os.path.join(output_dir, "output.zip")
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            zipf.write(final_output_path, arcname="final_output.wav")
+        for f in temp_files:
+            os.remove(f)
+        return zip_path
 # --- UI ---
 speaker_choices = list_speaker_choices()
     gr.Markdown("## 📄 TTS Voice Generator with Paragraph-Wise Processing")
     with gr.Row():
+        model_selector = gr.Dropdown(label="Select TTS Engine", choices=["Coqui", "Bark", "VCTK"], value="VCTK")
         speaker_dropdown = gr.Dropdown(label="Select Voice", choices=speaker_choices)
     with gr.Row():
         sample_textbox = gr.Textbox(label="Enter Sample Text (Max 500 characters)", max_lines=5)
         sample_button = gr.Button("Generate Sample")
         clear_button = gr.Button("Clear Sample")
     sample_audio = gr.Audio(label="Sample Output", type="filepath")
+    sample_button.click(fn=generate_sample_audio, inputs=[sample_textbox, speaker_dropdown, model_selector], outputs=[sample_audio])
     clear_button.click(fn=lambda: None, inputs=[], outputs=[sample_audio])
     with gr.Row():
         generate_button = gr.Button("Generate Full Audio")
         download_output = gr.File(label="Download Output Zip")
+    generate_button.click(fn=generate_audio, inputs=[docx_input, speaker_dropdown, model_selector], outputs=[download_output])
 if __name__ == "__main__":
     demo.launch()