Spaces:

Ryanus
/

fastspeech2

Running

App Files Files Community

Ryanus committed on Jun 30

Commit

9a15bbe

verified ·

1 Parent(s): 9d04103

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -24

app.py CHANGED Viewed

@@ -7,9 +7,6 @@ import os
 import torch
 # 載入 FastSpeech2 模型
-# SpeechBrain 提供了預訓練的 FastSpeech2 模型，可以直接從 Hugging Face Hub 載入。
-# savedir 參數指定了模型下載後儲存的本地路徑。
-# run_opts={"device":"cpu"} 確保模型在 CPU 上運行。
 fastspeech2 = FastSpeech2.from_hparams(
     source="speechbrain/tts-fastspeech2-ljspeech",
     savedir="pretrained_models/tts-fastspeech2-ljspeech",
@@ -17,8 +14,6 @@ fastspeech2 = FastSpeech2.from_hparams(
 )
 # 載入聲碼器 (Vocoder)
-# FastSpeech2 輸出的是梅爾頻譜圖 (mel-spectrogram)，需要聲碼器將其轉換為可聽的音訊。
-# 我們使用 HiFi-GAN 作為聲碼器。
 hifi_gan = HIFIGAN.from_hparams(
     source="speechbrain/tts-hifigan-ljspeech",
     savedir="pretrained_models/tts-hifigan-ljspeech",
@@ -29,36 +24,47 @@ def synthesize_speech(text):
     """
     將輸入文字轉換為語音。
     """
-    if not text:
-        return None
-    # 將文字編碼為梅爾頻譜圖
-    # pace 參數可以調整語音的速度，1.0 是正常速度。
-    mel_outputs, durations, pitch, energy = fastspeech2.encode_text(
-        [text], pace=1.0
-    )
-    # 使用聲碼器將梅爾頻譜圖轉換為音訊波形
-    # squeeze=True 確保輸出是單聲道。
-    wav = hifi_gan.decode_batch(mel_outputs).squeeze(1)
-    # 將音訊張量轉換為 NumPy 陣列
-    audio_numpy = wav.cpu().numpy().flatten()
-    # 定義輸出檔案路徑
-    output_file = "output.wav"
-    # 將音訊保存為 WAV 檔案
-    # 採樣率為 16000 Hz，這是 SpeechBrain 模型的預設採樣率。
-    torchaudio.save(output_file, torch.tensor(audio_numpy).unsqueeze(0), 16000)
-    return output_file
 # 創建 Gradio 介面
 iface = gr.Interface(
     fn=synthesize_speech,
     inputs=gr.Textbox(lines=2, placeholder="請輸入您想要合成的文字..."),
-    outputs=gr.Audio(type="filepath", label="合成語音"),
     title="FastSpeech2 文字轉語音 (CPU)",
     description="這是一個使用 SpeechBrain 的 FastSpeech2 模型在 Hugging Face Spaces 的 CPU 上進行文字轉語音的演示。由於在 CPU 上運行，合成速度可能會較慢。",
     examples=[

 import torch
 # 載入 FastSpeech2 模型
 fastspeech2 = FastSpeech2.from_hparams(
     source="speechbrain/tts-fastspeech2-ljspeech",
     savedir="pretrained_models/tts-fastspeech2-ljspeech",
 )
 # 載入聲碼器 (Vocoder)
 hifi_gan = HIFIGAN.from_hparams(
     source="speechbrain/tts-hifigan-ljspeech",
     savedir="pretrained_models/tts-hifigan-ljspeech",
     """
     將輸入文字轉換為語音。
     """
+    # 檢查輸入文字是否為空或只包含空白字元
+    if not text or text.strip() == "":
+        # 返回一個錯誤訊息或空音訊，而不是直接 None
+        # Gradio 介面會顯示這個錯誤訊息
+        return None, "請輸入有效的文字進行語音合成。"
+    try:
+        # 將文字編碼為梅爾頻譜圖
+        mel_outputs, durations, pitch, energy = fastspeech2.encode_text(
+            [text], pace=1.0
+        )
+        # 使用聲碼器將梅爾頻譜圖轉換為音訊波形
+        wav = hifi_gan.decode_batch(mel_outputs).squeeze(1)
+        # 將音訊張量轉換為 NumPy 陣列
+        audio_numpy = wav.cpu().numpy().flatten()
+        # 定義輸出檔案路徑
+        output_file = "output.wav"
+        # 將音訊保存為 WAV 檔案
+        torchaudio.save(output_file, torch.tensor(audio_numpy).unsqueeze(0), 16000)
+        return output_file, "語音合成成功！"
+    except IndexError as e:
+        # 捕獲特定的 IndexError，並提供更詳細的錯誤訊息
+        return None, f"語音合成失敗：處理文字時發生錯誤 (IndexError)。請嘗試不同的文字。錯誤詳情: {e}"
+    except Exception as e:
+        # 捕獲其他所有可能的錯誤
+        return None, f"語音合成失敗：發生未知錯誤。錯誤詳情: {e}"
 # 創建 Gradio 介面
 iface = gr.Interface(
     fn=synthesize_speech,
     inputs=gr.Textbox(lines=2, placeholder="請輸入您想要合成的文字..."),
+    outputs=[
+        gr.Audio(type="filepath", label="合成語音"),
+        gr.Textbox(label="狀態訊息") # 新增一個文字框來顯示狀態或錯誤訊息
+    ],
     title="FastSpeech2 文字轉語音 (CPU)",
     description="這是一個使用 SpeechBrain 的 FastSpeech2 模型在 Hugging Face Spaces 的 CPU 上進行文字轉語音的演示。由於在 CPU 上運行，合成速度可能會較慢。",
     examples=[