Spaces:

Den4ikAI
/

ESpeech-TTS

Running on Zero

App Files Files Community

Den4ikAI committed on Aug 25

Commit

80d1f1b

verified ·

1 Parent(s): 7f51b5d

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -15

app.py CHANGED Viewed

@@ -84,6 +84,10 @@ MODEL_CFG = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_lay
 _cached_local_paths = {}
 loaded_models = {}  # хранит объекты моделей в памяти (по имени выбора)
 # ----------------- Вспомогательные функции HF -----------------
 def hf_download_file(repo_id: str, filename: str, token: str = None):
     try:
@@ -143,6 +147,22 @@ print("Loading vocoder (CPU) ...")
 vocoder = load_vocoder()
 print("Vocoder loaded.")
 # ----------------- Основная функция синтеза (GPU-aware) -----------------
 # Декорируем synthesize, чтобы при вызове Space выделял GPU (если доступно).
 # duration — сколько секунд просим GPU (адаптируйте под ваш инференс).
@@ -168,7 +188,7 @@ def synthesize(
     """
     if not ref_audio:
         gr.Warning("Please provide reference audio.")
-        return None, None, ref_text
     if seed is None or seed < 0 or seed > 2**31 - 1:
         seed = np.random.randint(0, 2**31 - 1)
@@ -176,7 +196,7 @@ def synthesize(
     if not gen_text or not gen_text.strip():
         gr.Warning("Please enter text to generate.")
-        return None, None, ref_text
     # ASR если нужно
     if not ref_text or not ref_text.strip():
@@ -195,18 +215,18 @@ def synthesize(
             gr.Info(f"ASR transcription: {ref_text}")
         except Exception as e:
             gr.Warning(f"ASR failed: {e}")
-            return None, None, ref_text
-    # Акцентирование
-    processed_ref_text = accentizer.process_all(ref_text) if ref_text and ref_text.strip() else ref_text
-    processed_gen_text = accentizer.process_all(gen_text)
     # Ленивая загрузка модели (в CPU)
     try:
         model = load_model_if_needed(model_choice)
     except Exception as e:
         gr.Warning(f"Failed to download/load model {model_choice}: {e}")
-        return None, None, ref_text
     # Определяем устройство (в ZeroGPU внутри декоратора должен быть доступен CUDA)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -230,7 +250,7 @@ def synthesize(
         # Препроцессинг рефа (оно ожидает путь/файл)
         try:
-            ref_audio_proc, processed_ref_text = preprocess_ref_audio_text(
                 ref_audio,
                 processed_ref_text,
                 show_info=gr.Info
@@ -238,13 +258,13 @@ def synthesize(
         except Exception as e:
             gr.Warning(f"Preprocess failed: {e}")
             traceback.print_exc()
-            return None, None, ref_text
         # Инференс (предполагается, что infer_process корректно работает и на GPU)
         try:
             final_wave, final_sample_rate, combined_spectrogram = infer_process(
                 ref_audio_proc,
-                processed_ref_text,
                 processed_gen_text,
                 model,
                 vocoder,
@@ -257,7 +277,7 @@ def synthesize(
         except Exception as e:
             gr.Warning(f"Infer failed: {e}")
             traceback.print_exc()
-            return None, None, ref_text
         # Удаление тишин (на CPU)
         if remove_silence:
@@ -280,7 +300,7 @@ def synthesize(
             print("Save spectrogram failed:", e)
             spectrogram_path = None
-        return (final_sample_rate, final_wave), spectrogram_path, processed_ref_text
     finally:
         # Переносим всё обратно на CPU и очищаем GPU память
@@ -301,6 +321,18 @@ def synthesize(
 with gr.Blocks(title="ESpeech-TTS") as app:
     gr.Markdown("# ESpeech-TTS")
     gr.Markdown("See more on https://huggingface.co/ESpeech")
     model_choice = gr.Dropdown(
         choices=list(MODEL_REPOS.keys()),
@@ -312,9 +344,29 @@ with gr.Blocks(title="ESpeech-TTS") as app:
     with gr.Row():
         with gr.Column():
             ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
-            ref_text_input = gr.Textbox(label="Reference Text", lines=2, placeholder="leave empty → ASR")
         with gr.Column():
-            gen_text_input = gr.Textbox(label="Text to Generate", lines=5, max_lines=20)
     with gr.Row():
         with gr.Column():
@@ -331,6 +383,37 @@ with gr.Blocks(title="ESpeech-TTS") as app:
         audio_output = gr.Audio(label="Generated Audio", type="numpy")
         spectrogram_output = gr.Image(label="Spectrogram", type="filepath")
     generate_btn.click(
         synthesize,
         inputs=[
@@ -344,7 +427,7 @@ with gr.Blocks(title="ESpeech-TTS") as app:
             nfe_slider,
             speed_slider,
         ],
-        outputs=[audio_output, spectrogram_output, ref_text_input]
     )
 if __name__ == "__main__":

 _cached_local_paths = {}
 loaded_models = {}  # хранит объекты моделей в памяти (по имени выбора)
+# Пример текста для демонстрации
+EXAMPLE_TEXT = "Экспериментальный центр напоминает вам о том, что кубы не умеют разговаривать. В случае, если грузовой куб все же заговорит, центр настоятельно рекомендует вам игнорировать его советы."
+EXAMPLE_REF_AUDIO = "ref/example.mp3"
 # ----------------- Вспомогательные функции HF -----------------
 def hf_download_file(repo_id: str, filename: str, token: str = None):
     try:
 vocoder = load_vocoder()
 print("Vocoder loaded.")
+# ----------------- Функция для обработки текста с учетом "+" -----------------
def process_text_with_accent(text, accentizer):
    """Add stress marks to ``text`` unless the user already supplied them.

    A ``'+'`` anywhere in the text is taken to mean the user placed the
    stress marks manually, so the text is returned untouched. Otherwise the
    text is passed through the accentizer.

    Args:
        text: Text to process. May be ``None``, empty, or whitespace-only.
        accentizer: Object exposing ``process_all(text)``; the surrounding
            comments refer to it as RUAccent.

    Returns:
        ``text`` unchanged when it is ``None``, empty, whitespace-only, or
        contains ``'+'``; otherwise the result of
        ``accentizer.process_all(text)``.
    """
    # Nothing to accentuate: hand the input back as-is (None stays None).
    if not text or not text.strip():
        return text
    # Manual stress marks are present; do not let the accentizer override them.
    if '+' in text:
        return text
    return accentizer.process_all(text)
 # ----------------- Основная функция синтеза (GPU-aware) -----------------
 # Декорируем synthesize, чтобы при вызове Space выделял GPU (если доступно).
 # duration — сколько секунд просим GPU (адаптируйте под ваш инференс).
     """
     if not ref_audio:
         gr.Warning("Please provide reference audio.")
+        return None, None, ref_text, gen_text
     if seed is None or seed < 0 or seed > 2**31 - 1:
         seed = np.random.randint(0, 2**31 - 1)
     if not gen_text or not gen_text.strip():
         gr.Warning("Please enter text to generate.")
+        return None, None, ref_text, gen_text
     # ASR если нужно
     if not ref_text or not ref_text.strip():
             gr.Info(f"ASR transcription: {ref_text}")
         except Exception as e:
             gr.Warning(f"ASR failed: {e}")
+            return None, None, ref_text, gen_text
+    # Акцентирование с учетом наличия символа "+"
+    processed_ref_text = process_text_with_accent(ref_text, accentizer)
+    processed_gen_text = process_text_with_accent(gen_text, accentizer)
     # Ленивая загрузка модели (в CPU)
     try:
         model = load_model_if_needed(model_choice)
     except Exception as e:
         gr.Warning(f"Failed to download/load model {model_choice}: {e}")
+        return None, None, processed_ref_text, processed_gen_text
     # Определяем устройство (в ZeroGPU внутри декоратора должен быть доступен CUDA)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         # Препроцессинг рефа (оно ожидает путь/файл)
         try:
+            ref_audio_proc, processed_ref_text_final = preprocess_ref_audio_text(
                 ref_audio,
                 processed_ref_text,
                 show_info=gr.Info
         except Exception as e:
             gr.Warning(f"Preprocess failed: {e}")
             traceback.print_exc()
+            return None, None, processed_ref_text, processed_gen_text
         # Инференс (предполагается, что infer_process корректно работает и на GPU)
         try:
             final_wave, final_sample_rate, combined_spectrogram = infer_process(
                 ref_audio_proc,
+                processed_ref_text_final,
                 processed_gen_text,
                 model,
                 vocoder,
         except Exception as e:
             gr.Warning(f"Infer failed: {e}")
             traceback.print_exc()
+            return None, None, processed_ref_text, processed_gen_text
         # Удаление тишин (на CPU)
         if remove_silence:
             print("Save spectrogram failed:", e)
             spectrogram_path = None
+        return (final_sample_rate, final_wave), spectrogram_path, processed_ref_text_final, processed_gen_text
     finally:
         # Переносим всё обратно на CPU и очищаем GPU память
 with gr.Blocks(title="ESpeech-TTS") as app:
     gr.Markdown("# ESpeech-TTS")
     gr.Markdown("See more on https://huggingface.co/ESpeech")
+    gr.Markdown("💡 **Tip:** Add '+' symbol in text to mark custom stress (e.g., 'прив+ет'). Text with '+' won't be processed by RUAccent.")
+    # Описание моделей на русском языке
+    gr.Markdown("""
+    ## 📋 Описание моделей:
+    - **ESpeech-TTS-1 [RL] V1** - Первая версия модели с RL
+    - **ESpeech-TTS-1 [RL] V2** - Вторая версия модели с RL
+    - **ESpeech-TTS-1 PODCASTER [SFT]** - Модель обученная только на подкастах, лучше генерирует спонтанную речь
+    - **ESpeech-TTS-1 [SFT] 95K** - чекпоинт с 95000 шагов (на нем основана RL V1)
+    - **ESpeech-TTS-1 [SFT] 265K** - чекпоинт с 265000 шагов (на нем основана RL V2)
+    """)
     model_choice = gr.Dropdown(
         choices=list(MODEL_REPOS.keys()),
     with gr.Row():
         with gr.Column():
             ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
+            ref_text_input = gr.Textbox(
+                label="Reference Text",
+                lines=2,
+                placeholder="leave empty → ASR will transcribe"
+            )
+            ref_text_output = gr.Textbox(
+                label="Processed Reference Text (with accents)",
+                lines=2,
+                interactive=False
+            )
         with gr.Column():
+            gen_text_input = gr.Textbox(
+                label="Text to Generate",
+                lines=5,
+                max_lines=20,
+                placeholder="Enter text to synthesize..."
+            )
+            gen_text_output = gr.Textbox(
+                label="Processed Text to Generate (with accents)",
+                lines=5,
+                max_lines=20,
+                interactive=False
+            )
     with gr.Row():
         with gr.Column():
         audio_output = gr.Audio(label="Generated Audio", type="numpy")
         spectrogram_output = gr.Image(label="Spectrogram", type="filepath")
+    # Примеры
+    gr.Markdown("## 🎯 Example")
+    gr.Examples(
+        examples=[
+            [
+                EXAMPLE_REF_AUDIO,  # ref_audio
+                "",  # ref_text (empty for ASR)
+                EXAMPLE_TEXT,  # gen_text
+                False,  # remove_silence
+                42,  # seed
+                0.15,  # cross_fade
+                48,  # nfe_step
+                1.0,  # speed
+            ]
+        ],
+        inputs=[
+            ref_audio_input,
+            ref_text_input,
+            gen_text_input,
+            remove_silence,
+            seed_input,
+            cross_fade_slider,
+            nfe_slider,
+            speed_slider,
+        ],
+        outputs=[audio_output, spectrogram_output, ref_text_output, gen_text_output],
+        fn=lambda *args: synthesize(model_choice.value, *args),
+        cache_examples=True,
+        run_on_click=True,
+    )
     generate_btn.click(
         synthesize,
         inputs=[
             nfe_slider,
             speed_slider,
         ],
+        outputs=[audio_output, spectrogram_output, ref_text_output, gen_text_output]
     )
 if __name__ == "__main__":