miya3333 committed on
Commit
aca07cd
·
verified ·
1 Parent(s): 998ecb9

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +139 -6
  2. app.py +135 -0
  3. client_test.py +140 -0
  4. requirements.txt +10 -0
README.md CHANGED
@@ -1,13 +1,146 @@
1
  ---
2
- title: Speechbrain-tts-api-test-miya333 CS4
3
- emoji: 📊
4
- colorFrom: indigo
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 5.35.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: SpeechBrain TTS API
3
+ emoji: 🗣️
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
+ # SpeechBrain TTS API
14
+
15
+ A Text-to-Speech API built with FastAPI and SpeechBrain, running on Hugging Face Spaces.
16
+
17
+ ## Features
18
+
19
+ - 🎯 **Fast TTS synthesis** using SpeechBrain's Tacotron2 + HiFiGAN
20
+ - 🔄 **Multiple output formats** (WAV stream and Base64)
21
+ - 📝 **Simple REST API** with JSON requests
22
+ - 🚀 **Ready for deployment** on Hugging Face Spaces
23
+
24
+ ## API Endpoints
25
+
26
+ ### `GET /`
27
+ Returns API status message.
28
+
29
+ ### `GET /health`
30
+ Health check endpoint.
31
+
32
+ ### `POST /synthesize`
33
+ Synthesizes speech and returns the audio as a WAV stream.
34
+
35
+ **Request:**
36
+ ```json
37
+ {
38
+ "text": "Hello, this is a test message",
39
+ "sample_rate": 22050
40
+ }
41
+ ```
42
+
43
+ **Response:** WAV audio file stream
44
+
45
+ ### `POST /synthesize_base64`
46
+ Synthesizes speech and returns the audio as a Base64-encoded string.
47
+
48
+ **Request:**
49
+ ```json
50
+ {
51
+ "text": "Hello, this is a test message",
52
+ "sample_rate": 22050
53
+ }
54
+ ```
55
+
56
+ **Response:**
57
+ ```json
58
+ {
59
+ "audio_base64": "UklGRkq...",
60
+ "sample_rate": 22050,
61
+ "text": "Hello, this is a test message"
62
+ }
63
+ ```
64
+
65
+ ## Usage Examples
66
+
67
+ ### Python Client
68
+ ```python
69
+ import requests
70
+ import base64
71
+ from io import BytesIO
72
+
73
+ # Text to synthesize
74
+ text = "Hello world, this is a speech synthesis test."
75
+
76
+ # Request to Base64 endpoint
77
+ response = requests.post(
78
+ "https://your-space-url.hf.space/synthesize_base64",
79
+ json={"text": text, "sample_rate": 22050}
80
+ )
81
+
82
+ if response.status_code == 200:
83
+ result = response.json()
84
+
85
+ # Decode Base64 audio
86
+ audio_data = base64.b64decode(result["audio_base64"])
87
+
88
+ # Save as WAV file
89
+ with open("output.wav", "wb") as f:
90
+ f.write(audio_data)
91
+
92
+ print("Audio saved as output.wav")
93
+ ```
94
+
95
+ ### JavaScript Client
96
+ ```javascript
97
+ async function synthesizeSpeech(text) {
98
+ const response = await fetch('/synthesize_base64', {
99
+ method: 'POST',
100
+ headers: {
101
+ 'Content-Type': 'application/json',
102
+ },
103
+ body: JSON.stringify({
104
+ text: text,
105
+ sample_rate: 22050
106
+ })
107
+ });
108
+
109
+ if (response.ok) {
110
+ const result = await response.json();
111
+
112
+ // Create audio element
113
+ const audio = new Audio();
114
+ audio.src = `data:audio/wav;base64,${result.audio_base64}`;
115
+ audio.play();
116
+ }
117
+ }
118
+
119
+ // Usage
120
+ synthesizeSpeech("Hello from JavaScript!");
121
+ ```
122
+
123
+ ## Local Development
124
+
125
+ 1. Install dependencies:
126
+ ```bash
127
+ pip install -r requirements.txt
128
+ ```
129
+
130
+ 2. Run the server:
131
+ ```bash
132
+ python app.py
133
+ ```
134
+
135
+ 3. The API will be available at `http://localhost:7860`
136
+
137
+ ## Model Information
138
+
139
+ - **TTS Model:** SpeechBrain Tacotron2 (LJSpeech)
140
+ - **Vocoder:** SpeechBrain HiFiGAN (LJSpeech)
141
+ - **Default Sample Rate:** 22,050 Hz
142
+ - **Text Limit:** 500 characters per request
143
+
144
+ ## License
145
+
146
+ MIT License
app.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.responses import StreamingResponse
3
+ from pydantic import BaseModel
4
+ import torch
5
+ import torchaudio
6
+ from speechbrain.pretrained import Tacotron2
7
+ from speechbrain.pretrained import HIFIGAN
8
+ import io
9
+ import numpy as np
10
+ import tempfile
11
+ import os
12
+
13
# ASGI application object; the route decorators in this module register
# their handlers on it.
app = FastAPI(
    title="SpeechBrain TTS API",
    description="Text-to-Speech API using SpeechBrain",
)
14
+
15
class TTSRequest(BaseModel):
    """JSON request body accepted by the synthesis endpoints."""

    # Text to convert to speech.
    text: str
    # Sample rate requested for the returned audio; 22050 when omitted.
    sample_rate: int = 22050
18
+
19
class TTSService:
    """Lazy-loading wrapper around the SpeechBrain Tacotron2 + HiFi-GAN models.

    Nothing is loaded in ``__init__``; the models are created on the first
    call to :meth:`synthesize` (via :meth:`load_models`) and reused afterwards.
    """

    # Rate the vocoder output is treated as before any resampling.
    NATIVE_SAMPLE_RATE = 22050

    def __init__(self):
        self.tacotron2 = None
        self.hifi_gan = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_models(self):
        """Load whichever of the two models has not been loaded yet."""
        # Pass the detected device on to SpeechBrain; without this the
        # ``self.device`` attribute was computed but never used.
        run_opts = {"device": self.device}

        if self.tacotron2 is None:
            print("Loading Tacotron2 model...")
            self.tacotron2 = Tacotron2.from_hparams(
                source="speechbrain/tts-tacotron2-ljspeech",
                savedir="tmpdir_tts",
                run_opts=run_opts,
            )

        if self.hifi_gan is None:
            print("Loading HiFi-GAN vocoder...")
            self.hifi_gan = HIFIGAN.from_hparams(
                source="speechbrain/tts-hifigan-ljspeech",
                savedir="tmpdir_vocoder",
                run_opts=run_opts,
            )

    def synthesize(self, text: str, sample_rate: int = 22050):
        """Convert ``text`` to a waveform.

        Args:
            text: Text to speak.
            sample_rate: Desired output sample rate. When it differs from
                ``NATIVE_SAMPLE_RATE`` the audio is resampled with librosa.

        Returns:
            A ``(audio, sample_rate)`` tuple where ``audio`` is a NumPy array
            and ``sample_rate`` is the value that was requested.

        Raises:
            ValueError: If ``sample_rate`` is not a positive number.
        """
        # Reject impossible rates up front, before paying for model loading
        # and inference, instead of failing inside the resampler.
        if sample_rate <= 0:
            raise ValueError(
                f"sample_rate must be a positive integer, got {sample_rate}"
            )

        self.load_models()

        # Only the mel spectrogram is needed; length and alignment are unused.
        mel_output, _mel_length, _alignment = self.tacotron2.encode_text(text)
        waveforms = self.hifi_gan.decode_batch(mel_output)

        # detach() keeps the conversion safe even if the tensor tracks grads.
        audio_np = waveforms.squeeze().detach().cpu().numpy()

        if sample_rate != self.NATIVE_SAMPLE_RATE:
            # Imported lazily so librosa is only needed when resampling.
            import librosa

            audio_np = librosa.resample(
                audio_np,
                orig_sr=self.NATIVE_SAMPLE_RATE,
                target_sr=sample_rate,
            )

        return audio_np, sample_rate


# Module-level instance shared by the request handlers.
tts_service = TTSService()
56
+
57
@app.get("/")
async def root():
    """Return a short message confirming that the API is running."""
    status = {"message": "SpeechBrain TTS API is running!"}
    return status
60
+
61
@app.get("/health")
async def health_check():
    """Return a fixed payload reporting the service as healthy."""
    payload = {"status": "healthy"}
    return payload
64
+
65
@app.post("/synthesize")
async def synthesize_speech(request: TTSRequest):
    """Synthesize ``request.text`` and return it as a downloadable WAV file.

    Args:
        request: Text to speak and the desired output sample rate.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav``.

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only, longer
            than 500 characters, or the sample rate is not positive; 500 when
            synthesis or WAV encoding fails.
    """
    # Validate outside the try block: previously these 400 errors were caught
    # by the blanket ``except Exception`` below and re-raised as 500.
    if not request.text or not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    if len(request.text) > 500:
        raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")

    if request.sample_rate <= 0:
        raise HTTPException(status_code=400, detail="Sample rate must be a positive integer")

    # NOTE(review): synthesis is a blocking call inside an async handler, so
    # it presumably stalls the event loop while it runs — confirm this is
    # acceptable for the expected load.
    tmp_path = None
    try:
        audio_data, sample_rate = tts_service.synthesize(request.text, request.sample_rate)

        # Reserve a path and close the handle before torchaudio writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name

        torchaudio.save(
            tmp_path,
            torch.tensor(audio_data).unsqueeze(0),
            sample_rate,
        )

        with open(tmp_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}") from e
    finally:
        # Always remove the temp file, including when saving/reading failed.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)

    return StreamingResponse(
        io.BytesIO(audio_bytes),
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=synthesized_audio.wav"},
    )
96
+
97
@app.post("/synthesize_base64")
async def synthesize_speech_base64(request: TTSRequest):
    """Synthesize ``request.text`` and return the WAV data Base64-encoded.

    Args:
        request: Text to speak and the desired output sample rate.

    Returns:
        A dict with keys ``audio_base64`` (Base64 text of the WAV file),
        ``sample_rate`` and ``text`` (the input text echoed back).

    Raises:
        HTTPException: 400 when the text is empty or whitespace-only, longer
            than 500 characters, or the sample rate is not positive; 500 when
            synthesis or WAV encoding fails.
    """
    import base64

    # Validate outside the try block: previously these 400 errors were caught
    # by the blanket ``except Exception`` below and re-raised as 500.
    if not request.text or not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    if len(request.text) > 500:
        raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")

    if request.sample_rate <= 0:
        raise HTTPException(status_code=400, detail="Sample rate must be a positive integer")

    # NOTE(review): synthesis is a blocking call inside an async handler, so
    # it presumably stalls the event loop while it runs — confirm this is
    # acceptable for the expected load.
    tmp_path = None
    try:
        audio_data, sample_rate = tts_service.synthesize(request.text, request.sample_rate)

        # Reserve a path and close the handle before torchaudio writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name

        torchaudio.save(
            tmp_path,
            torch.tensor(audio_data).unsqueeze(0),
            sample_rate,
        )

        with open(tmp_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}") from e
    finally:
        # Always remove the temp file, including when saving/reading failed.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)

    audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')

    return {
        "audio_base64": audio_base64,
        "sample_rate": sample_rate,
        "text": request.text,
    }
132
+
133
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 7860. uvicorn is imported here so it is only needed in this mode.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )
client_test.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import base64
3
+ import json
4
+ from pathlib import Path
5
+
6
class TTSClient:
    """Small HTTP client for the SpeechBrain TTS API.

    Every method returns a ``(success, payload)`` tuple instead of raising:
    ``payload`` is the result on success and an error description otherwise.
    """

    def __init__(self, base_url="http://localhost:7860", timeout=300):
        """Store the connection settings.

        Args:
            base_url: Root URL of the API; a trailing slash is stripped.
            timeout: Timeout in seconds handed to every ``requests`` call so a
                stalled server cannot block the client forever. The default
                is generous because synthesis requests can be slow.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def health_check(self):
        """Query ``/health``; return ``(is_ok, json_body_or_error_text)``."""
        try:
            response = requests.get(f"{self.base_url}/health", timeout=self.timeout)
            return response.status_code == 200, response.json()
        except Exception as e:
            return False, str(e)

    def synthesize_to_file(self, text, output_path="output.wav", sample_rate=22050):
        """Synthesize ``text`` via ``/synthesize`` and save the WAV response.

        Returns:
            ``(True, message)`` once the file is written, otherwise
            ``(False, error_text)``.
        """
        try:
            response = requests.post(
                f"{self.base_url}/synthesize",
                json={"text": text, "sample_rate": sample_rate},
                timeout=self.timeout,
            )

            if response.status_code != 200:
                return False, f"Error: {response.status_code} - {response.text}"

            with open(output_path, "wb") as f:
                f.write(response.content)
            return True, f"Audio saved to {output_path}"

        except Exception as e:
            return False, str(e)

    def synthesize_to_base64(self, text, sample_rate=22050):
        """Synthesize ``text`` via ``/synthesize_base64``.

        Returns:
            ``(True, response_json)`` on success, otherwise
            ``(False, error_text)``.
        """
        try:
            response = requests.post(
                f"{self.base_url}/synthesize_base64",
                json={"text": text, "sample_rate": sample_rate},
                timeout=self.timeout,
            )

            if response.status_code != 200:
                return False, f"Error: {response.status_code} - {response.text}"

            return True, response.json()

        except Exception as e:
            return False, str(e)

    def save_base64_audio(self, audio_base64, output_path="output_from_base64.wav"):
        """Decode Base64 audio data and write the bytes to ``output_path``.

        Returns:
            ``(True, message)`` once the file is written, otherwise
            ``(False, error_text)``.
        """
        try:
            audio_data = base64.b64decode(audio_base64)
            with open(output_path, "wb") as f:
                f.write(audio_data)
            return True, f"Audio saved to {output_path}"
        except Exception as e:
            return False, str(e)
61
+
62
def main():
    """Run a manual smoke test of the TTS API and print the results."""
    # Client for local development.
    client = TTSClient("http://localhost:7860")

    # For a Hugging Face Spaces deployment, point the client at the Space URL:
    # client = TTSClient("https://your-space-name.hf.space")

    print("=== SpeechBrain TTS API Client Test ===\n")

    # Health check; nothing else can work if this fails.
    print("1. Health Check:")
    is_healthy, health_result = client.health_check()
    health_label = '✓ Healthy' if is_healthy else '✗ Unhealthy'
    print(f" Status: {health_label}")
    print(f" Response: {health_result}\n")

    if not is_healthy:
        print("API is not available. Please check if the server is running.")
        return

    # Sentences used by both synthesis tests.
    test_texts = [
        "Hello, this is a test of the SpeechBrain TTS API.",
        "The quick brown fox jumps over the lazy dog.",
        "Welcome to Hugging Face Spaces!",
    ]

    # Test 1: save the WAV response directly to a file.
    print("2. Testing direct WAV file synthesis:")
    for number, text in enumerate(test_texts, start=1):
        print(f" Testing: '{text}'")
        success, result = client.synthesize_to_file(
            text,
            f"test_output_{number}.wav",
            sample_rate=22050,
        )
        outcome = '✓ Success' if success else '✗ Failed'
        print(f" Result: {outcome} - {result}")
    print()

    # Test 2: fetch the audio as Base64 and save it from there.
    print("3. Testing Base64 synthesis:")
    for number, text in enumerate(test_texts, start=1):
        print(f" Testing: '{text}'")
        success, result = client.synthesize_to_base64(text, sample_rate=22050)

        if not success:
            print(f" ✗ Failed - {result}")
            continue

        print(f" ✓ Success - Audio length: {len(result['audio_base64'])} chars")

        # Write the decoded Base64 payload to a file.
        save_success, save_result = client.save_base64_audio(
            result['audio_base64'],
            f"test_base64_{number}.wav",
        )
        save_outcome = '✓ Success' if save_success else '✗ Failed'
        print(f" Save result: {save_outcome} - {save_result}")
    print()

    # Test 3: error handling.
    print("4. Testing error handling:")

    # Empty text.
    print(" Testing empty text:")
    success, result = client.synthesize_to_file("", "empty_test.wav")
    outcome = '✓ Success' if success else '✗ Expected failure'
    print(f" Result: {outcome} - {result}")

    # Text over the 500-character limit.
    print(" Testing too long text:")
    long_text = "This is a very long text. " * 50
    success, result = client.synthesize_to_file(long_text, "long_test.wav")
    outcome = '✓ Success' if success else '✗ Expected failure'
    print(f" Result: {outcome} - {result}")

    print("\n=== Test Complete ===")
    print("Check the generated audio files:")
    for number in range(1, len(test_texts) + 1):
        print(f" - test_output_{number}.wav")
        print(f" - test_base64_{number}.wav")
138
+
139
if __name__ == "__main__":
    # Run the smoke test only when this file is executed as a script.
    main()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ speechbrain==0.5.16
4
+ torch>=1.13.0
5
+ torchaudio>=0.13.0
6
+ numpy>=1.21.0
7
+ scipy>=1.7.0
8
+ librosa>=0.9.0
9
+ soundfile>=0.12.1
10
+ python-multipart==0.0.6