Upload 4 files
Browse files- README.md +139 -6
- app.py +135 -0
- client_test.py +140 -0
- requirements.txt +10 -0
README.md
CHANGED
@@ -1,13 +1,146 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: SpeechBrain TTS API
|
3 |
+
emoji: 🗣️
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: purple
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.44.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
+
# SpeechBrain TTS API
|
14 |
+
|
15 |
+
A Text-to-Speech API built with FastAPI and SpeechBrain, running on Hugging Face Spaces.
|
16 |
+
|
17 |
+
## Features
|
18 |
+
|
19 |
+
- 🎯 **Fast TTS synthesis** using SpeechBrain's Tacotron2 + HiFiGAN
|
20 |
+
- 🔄 **Multiple output formats** (WAV stream and Base64)
|
21 |
+
- 📝 **Simple REST API** with JSON requests
|
22 |
+
- 🚀 **Ready for deployment** on Hugging Face Spaces
|
23 |
+
|
24 |
+
## API Endpoints
|
25 |
+
|
26 |
+
### `GET /`
|
27 |
+
Returns API status message.
|
28 |
+
|
29 |
+
### `GET /health`
|
30 |
+
Health check endpoint.
|
31 |
+
|
32 |
+
### `POST /synthesize`
|
33 |
+
Synthesizes speech and returns audio as WAV stream.
|
34 |
+
|
35 |
+
**Request:**
|
36 |
+
```json
|
37 |
+
{
|
38 |
+
"text": "Hello, this is a test message",
|
39 |
+
"sample_rate": 22050
|
40 |
+
}
|
41 |
+
```
|
42 |
+
|
43 |
+
**Response:** WAV audio file stream
|
44 |
+
|
45 |
+
### `POST /synthesize_base64`
|
46 |
+
Synthesizes speech and returns audio as Base64 encoded string.
|
47 |
+
|
48 |
+
**Request:**
|
49 |
+
```json
|
50 |
+
{
|
51 |
+
"text": "Hello, this is a test message",
|
52 |
+
"sample_rate": 22050
|
53 |
+
}
|
54 |
+
```
|
55 |
+
|
56 |
+
**Response:**
|
57 |
+
```json
|
58 |
+
{
|
59 |
+
"audio_base64": "UklGRkq...",
|
60 |
+
"sample_rate": 22050,
|
61 |
+
"text": "Hello, this is a test message"
|
62 |
+
}
|
63 |
+
```
|
64 |
+
|
65 |
+
## Usage Examples
|
66 |
+
|
67 |
+
### Python Client
|
68 |
+
```python
|
69 |
+
import requests
|
70 |
+
import base64
|
71 |
+
from io import BytesIO
|
72 |
+
|
73 |
+
# Text to synthesize
|
74 |
+
text = "Hello world, this is a speech synthesis test."
|
75 |
+
|
76 |
+
# Request to Base64 endpoint
|
77 |
+
response = requests.post(
|
78 |
+
"https://your-space-url.hf.space/synthesize_base64",
|
79 |
+
json={"text": text, "sample_rate": 22050}
|
80 |
+
)
|
81 |
+
|
82 |
+
if response.status_code == 200:
|
83 |
+
result = response.json()
|
84 |
+
|
85 |
+
# Decode Base64 audio
|
86 |
+
audio_data = base64.b64decode(result["audio_base64"])
|
87 |
+
|
88 |
+
# Save as WAV file
|
89 |
+
with open("output.wav", "wb") as f:
|
90 |
+
f.write(audio_data)
|
91 |
+
|
92 |
+
print("Audio saved as output.wav")
|
93 |
+
```
|
94 |
+
|
95 |
+
### JavaScript Client
|
96 |
+
```javascript
|
97 |
+
async function synthesizeSpeech(text) {
|
98 |
+
const response = await fetch('/synthesize_base64', {
|
99 |
+
method: 'POST',
|
100 |
+
headers: {
|
101 |
+
'Content-Type': 'application/json',
|
102 |
+
},
|
103 |
+
body: JSON.stringify({
|
104 |
+
text: text,
|
105 |
+
sample_rate: 22050
|
106 |
+
})
|
107 |
+
});
|
108 |
+
|
109 |
+
if (response.ok) {
|
110 |
+
const result = await response.json();
|
111 |
+
|
112 |
+
// Create audio element
|
113 |
+
const audio = new Audio();
|
114 |
+
audio.src = `data:audio/wav;base64,${result.audio_base64}`;
|
115 |
+
audio.play();
|
116 |
+
}
|
117 |
+
}
|
118 |
+
|
119 |
+
// Usage
|
120 |
+
synthesizeSpeech("Hello from JavaScript!");
|
121 |
+
```
|
122 |
+
|
123 |
+
## Local Development
|
124 |
+
|
125 |
+
1. Install dependencies:
|
126 |
+
```bash
|
127 |
+
pip install -r requirements.txt
|
128 |
+
```
|
129 |
+
|
130 |
+
2. Run the server:
|
131 |
+
```bash
|
132 |
+
python app.py
|
133 |
+
```
|
134 |
+
|
135 |
+
3. The API will be available at `http://localhost:7860`
|
136 |
+
|
137 |
+
## Model Information
|
138 |
+
|
139 |
+
- **TTS Model:** SpeechBrain Tacotron2 (LJSpeech)
|
140 |
+
- **Vocoder:** SpeechBrain HiFiGAN (LJSpeech)
|
141 |
+
- **Default Sample Rate:** 22,050 Hz
|
142 |
+
- **Text Limit:** 500 characters per request
|
143 |
+
|
144 |
+
## License
|
145 |
+
|
146 |
+
MIT License
|
app.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException
|
2 |
+
from fastapi.responses import StreamingResponse
|
3 |
+
from pydantic import BaseModel
|
4 |
+
import torch
|
5 |
+
import torchaudio
|
6 |
+
from speechbrain.pretrained import Tacotron2
|
7 |
+
from speechbrain.pretrained import HIFIGAN
|
8 |
+
import io
|
9 |
+
import numpy as np
|
10 |
+
import tempfile
|
11 |
+
import os
|
12 |
+
|
13 |
+
# Application instance; title/description appear in the auto-generated OpenAPI docs.
app = FastAPI(title="SpeechBrain TTS API", description="Text-to-Speech API using SpeechBrain")
|
14 |
+
|
15 |
+
class TTSRequest(BaseModel):
    """Request body shared by the /synthesize and /synthesize_base64 endpoints."""

    # Text to synthesize; the endpoints reject empty text and text over 500 chars.
    text: str
    # Desired output sample rate in Hz; audio is resampled when this is not 22050.
    sample_rate: int = 22050
|
18 |
+
|
19 |
+
class TTSService:
    """Lazy-loading wrapper around SpeechBrain's Tacotron2 + HiFi-GAN pipeline."""

    def __init__(self):
        # Models are loaded on first use, not at construction time, so the
        # server can start before the (large) checkpoints are downloaded.
        self.tacotron2 = None
        self.hifi_gan = None
        # NOTE(review): the device is recorded but never applied — the models
        # are not moved to it, so inference runs wherever from_hparams puts
        # them. Confirm whether GPU placement was intended.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_models(self):
        """Load both pretrained models if they are not already in memory."""
        if self.tacotron2 is None:
            print("Loading Tacotron2 model...")
            self.tacotron2 = Tacotron2.from_hparams(
                source="speechbrain/tts-tacotron2-ljspeech",
                savedir="tmpdir_tts",
            )
        if self.hifi_gan is None:
            print("Loading HiFi-GAN vocoder...")
            self.hifi_gan = HIFIGAN.from_hparams(
                source="speechbrain/tts-hifigan-ljspeech",
                savedir="tmpdir_vocoder",
            )

    def synthesize(self, text: str, sample_rate: int = 22050):
        """Synthesize *text* and return ``(audio_array, sample_rate)``.

        The mel spectrogram from Tacotron2 is vocoded by HiFi-GAN; the
        resulting waveform is returned as a NumPy array on the CPU.
        """
        self.load_models()

        # encode_text also returns mel lengths and alignments, unused here.
        mel_spec, _mel_len, _alignment = self.tacotron2.encode_text(text)
        wave = self.hifi_gan.decode_batch(mel_spec)
        audio = wave.squeeze().cpu().numpy()

        if sample_rate != 22050:
            # Both LJSpeech models emit 22.05 kHz audio; resample only on request.
            import librosa
            audio = librosa.resample(audio, orig_sr=22050, target_sr=sample_rate)

        return audio, sample_rate
|
54 |
+
|
55 |
+
# Module-level singleton shared by all request handlers; models load lazily
# on the first synthesis request.
tts_service = TTSService()
|
56 |
+
|
57 |
+
@app.get("/")
|
58 |
+
async def root():
|
59 |
+
return {"message": "SpeechBrain TTS API is running!"}
|
60 |
+
|
61 |
+
@app.get("/health")
|
62 |
+
async def health_check():
|
63 |
+
return {"status": "healthy"}
|
64 |
+
|
65 |
+
@app.post("/synthesize")
|
66 |
+
async def synthesize_speech(request: TTSRequest):
|
67 |
+
try:
|
68 |
+
if not request.text or len(request.text.strip()) == 0:
|
69 |
+
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
70 |
+
|
71 |
+
if len(request.text) > 500:
|
72 |
+
raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
|
73 |
+
|
74 |
+
audio_data, sample_rate = tts_service.synthesize(request.text, request.sample_rate)
|
75 |
+
|
76 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
77 |
+
torchaudio.save(
|
78 |
+
tmp_file.name,
|
79 |
+
torch.tensor(audio_data).unsqueeze(0),
|
80 |
+
sample_rate
|
81 |
+
)
|
82 |
+
|
83 |
+
with open(tmp_file.name, "rb") as audio_file:
|
84 |
+
audio_bytes = audio_file.read()
|
85 |
+
|
86 |
+
os.unlink(tmp_file.name)
|
87 |
+
|
88 |
+
return StreamingResponse(
|
89 |
+
io.BytesIO(audio_bytes),
|
90 |
+
media_type="audio/wav",
|
91 |
+
headers={"Content-Disposition": "attachment; filename=synthesized_audio.wav"}
|
92 |
+
)
|
93 |
+
|
94 |
+
except Exception as e:
|
95 |
+
raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
|
96 |
+
|
97 |
+
@app.post("/synthesize_base64")
|
98 |
+
async def synthesize_speech_base64(request: TTSRequest):
|
99 |
+
import base64
|
100 |
+
|
101 |
+
try:
|
102 |
+
if not request.text or len(request.text.strip()) == 0:
|
103 |
+
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
104 |
+
|
105 |
+
if len(request.text) > 500:
|
106 |
+
raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
|
107 |
+
|
108 |
+
audio_data, sample_rate = tts_service.synthesize(request.text, request.sample_rate)
|
109 |
+
|
110 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
111 |
+
torchaudio.save(
|
112 |
+
tmp_file.name,
|
113 |
+
torch.tensor(audio_data).unsqueeze(0),
|
114 |
+
sample_rate
|
115 |
+
)
|
116 |
+
|
117 |
+
with open(tmp_file.name, "rb") as audio_file:
|
118 |
+
audio_bytes = audio_file.read()
|
119 |
+
|
120 |
+
os.unlink(tmp_file.name)
|
121 |
+
|
122 |
+
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
|
123 |
+
|
124 |
+
return {
|
125 |
+
"audio_base64": audio_base64,
|
126 |
+
"sample_rate": sample_rate,
|
127 |
+
"text": request.text
|
128 |
+
}
|
129 |
+
|
130 |
+
except Exception as e:
|
131 |
+
raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
|
132 |
+
|
133 |
+
# Local entry point: bind to all interfaces on port 7860, the port
# Hugging Face Spaces expects the app to serve on.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
client_test.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import base64
|
3 |
+
import json
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
class TTSClient:
    """Small HTTP client for the SpeechBrain TTS API.

    Every method returns a ``(success, payload_or_message)`` tuple instead of
    raising, so callers can report failures without try/except.
    """

    def __init__(self, base_url="http://localhost:7860", timeout=30.0):
        """Create a client.

        Args:
            base_url: Root URL of the API; a trailing slash is stripped.
            timeout: Per-request timeout in seconds. The original client set
                no timeout, so a stalled server would hang it forever.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def health_check(self):
        """Check API health; return (is_healthy, response_json_or_error)."""
        try:
            response = requests.get(f"{self.base_url}/health", timeout=self.timeout)
            return response.status_code == 200, response.json()
        except Exception as e:
            return False, str(e)

    def synthesize_to_file(self, text, output_path="output.wav", sample_rate=22050):
        """Synthesize *text* and save the returned WAV bytes to *output_path*.

        Returns (success, human-readable message).
        """
        try:
            response = requests.post(
                f"{self.base_url}/synthesize",
                json={"text": text, "sample_rate": sample_rate},
                timeout=self.timeout,
            )

            if response.status_code == 200:
                with open(output_path, "wb") as f:
                    f.write(response.content)
                return True, f"Audio saved to {output_path}"
            else:
                return False, f"Error: {response.status_code} - {response.text}"

        except Exception as e:
            return False, str(e)

    def synthesize_to_base64(self, text, sample_rate=22050):
        """Synthesize *text* via the Base64 endpoint.

        Returns (success, response_json_or_error_message).
        """
        try:
            response = requests.post(
                f"{self.base_url}/synthesize_base64",
                json={"text": text, "sample_rate": sample_rate},
                timeout=self.timeout,
            )

            if response.status_code == 200:
                return True, response.json()
            else:
                return False, f"Error: {response.status_code} - {response.text}"

        except Exception as e:
            return False, str(e)

    def save_base64_audio(self, audio_base64, output_path="output_from_base64.wav"):
        """Decode a Base64 audio string and write the bytes to *output_path*.

        Returns (success, human-readable message).
        """
        try:
            audio_data = base64.b64decode(audio_base64)
            with open(output_path, "wb") as f:
                f.write(audio_data)
            return True, f"Audio saved to {output_path}"
        except Exception as e:
            return False, str(e)
|
61 |
+
|
62 |
+
def main():
    """Exercise every endpoint of the TTS API end to end."""
    # Client for local development.
    client = TTSClient("http://localhost:7860")

    # For a Hugging Face Spaces deployment, point at the Space URL instead:
    # client = TTSClient("https://your-space-name.hf.space")

    print("=== SpeechBrain TTS API Client Test ===\n")

    # Step 1: make sure the server is reachable before running synthesis tests.
    print("1. Health Check:")
    is_healthy, health_result = client.health_check()
    print(f" Status: {'✓ Healthy' if is_healthy else '✗ Unhealthy'}")
    print(f" Response: {health_result}\n")

    if not is_healthy:
        print("API is not available. Please check if the server is running.")
        return

    # Sentences used by every synthesis test below.
    test_texts = [
        "Hello, this is a test of the SpeechBrain TTS API.",
        "The quick brown fox jumps over the lazy dog.",
        "Welcome to Hugging Face Spaces!"
    ]

    # Step 2: synthesis straight to WAV files on disk.
    print("2. Testing direct WAV file synthesis:")
    for index, text in enumerate(test_texts, start=1):
        print(f" Testing: '{text}'")
        success, result = client.synthesize_to_file(
            text,
            f"test_output_{index}.wav",
            sample_rate=22050
        )
        print(f" Result: {'✓ Success' if success else '✗ Failed'} - {result}")
        print()

    # Step 3: synthesis through the Base64 endpoint, then decode and save.
    print("3. Testing Base64 synthesis:")
    for index, text in enumerate(test_texts, start=1):
        print(f" Testing: '{text}'")
        success, result = client.synthesize_to_base64(text, sample_rate=22050)

        if success:
            print(f" ✓ Success - Audio length: {len(result['audio_base64'])} chars")

            save_success, save_result = client.save_base64_audio(
                result['audio_base64'],
                f"test_base64_{index}.wav"
            )
            print(f" Save result: {'✓ Success' if save_success else '✗ Failed'} - {save_result}")
        else:
            print(f" ✗ Failed - {result}")
        print()

    # Step 4: inputs the server is expected to reject.
    print("4. Testing error handling:")

    # Empty text must be rejected with a 400.
    print(" Testing empty text:")
    success, result = client.synthesize_to_file("", "empty_test.wav")
    print(f" Result: {'✓ Success' if success else '✗ Expected failure'} - {result}")

    # Over-long text must be rejected with a 400.
    print(" Testing too long text:")
    long_text = "This is a very long text. " * 50  # well past the 500-character limit
    success, result = client.synthesize_to_file(long_text, "long_test.wav")
    print(f" Result: {'✓ Success' if success else '✗ Expected failure'} - {result}")

    print("\n=== Test Complete ===")
    print("Check the generated audio files:")
    for index in range(1, len(test_texts) + 1):
        print(f" - test_output_{index}.wav")
        print(f" - test_base64_{index}.wav")
|
138 |
+
|
139 |
+
# Run the end-to-end API exercise when the script is invoked directly.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.104.1
|
2 |
+
uvicorn[standard]==0.24.0
|
3 |
+
speechbrain==0.5.16
|
4 |
+
torch>=1.13.0
|
5 |
+
torchaudio>=0.13.0
|
6 |
+
numpy>=1.21.0
|
7 |
+
scipy>=1.7.0
|
8 |
+
librosa>=0.9.0
|
9 |
+
soundfile>=0.12.1
|
10 |
+
python-multipart==0.0.6
|