Upload 4 files
Browse files- README.md +139 -6
- app.py +135 -0
- client_test.py +140 -0
- requirements.txt +10 -0
README.md
CHANGED
@@ -1,13 +1,146 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: SpeechBrain TTS API
|
3 |
+
emoji: 🗣️
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: purple
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.44.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
+
# SpeechBrain TTS API
|
14 |
+
|
15 |
+
A Text-to-Speech API built with FastAPI and SpeechBrain, running on Hugging Face Spaces.
|
16 |
+
|
17 |
+
## Features
|
18 |
+
|
19 |
+
- 🎯 **Fast TTS synthesis** using SpeechBrain's Tacotron2 + HiFiGAN
|
20 |
+
- 🔄 **Multiple output formats** (WAV stream and Base64)
|
21 |
+
- 📝 **Simple REST API** with JSON requests
|
22 |
+
- 🚀 **Ready for deployment** on Hugging Face Spaces
|
23 |
+
|
24 |
+
## API Endpoints
|
25 |
+
|
26 |
+
### `GET /`
|
27 |
+
Returns API status message.
|
28 |
+
|
29 |
+
### `GET /health`
|
30 |
+
Health check endpoint.
|
31 |
+
|
32 |
+
### `POST /synthesize`
|
33 |
+
Synthesizes speech and returns audio as WAV stream.
|
34 |
+
|
35 |
+
**Request:**
|
36 |
+
```json
|
37 |
+
{
|
38 |
+
"text": "Hello, this is a test message",
|
39 |
+
"sample_rate": 22050
|
40 |
+
}
|
41 |
+
```
|
42 |
+
|
43 |
+
**Response:** WAV audio file stream
|
44 |
+
|
45 |
+
### `POST /synthesize_base64`
|
46 |
+
Synthesizes speech and returns audio as Base64 encoded string.
|
47 |
+
|
48 |
+
**Request:**
|
49 |
+
```json
|
50 |
+
{
|
51 |
+
"text": "Hello, this is a test message",
|
52 |
+
"sample_rate": 22050
|
53 |
+
}
|
54 |
+
```
|
55 |
+
|
56 |
+
**Response:**
|
57 |
+
```json
|
58 |
+
{
|
59 |
+
"audio_base64": "UklGRkq...",
|
60 |
+
"sample_rate": 22050,
|
61 |
+
"text": "Hello, this is a test message"
|
62 |
+
}
|
63 |
+
```
|
64 |
+
|
65 |
+
## Usage Examples
|
66 |
+
|
67 |
+
### Python Client
|
68 |
+
```python
|
69 |
+
import requests
|
70 |
+
import base64
|
71 |
+
from io import BytesIO
|
72 |
+
|
73 |
+
# Text to synthesize
|
74 |
+
text = "Hello world, this is a speech synthesis test."
|
75 |
+
|
76 |
+
# Request to Base64 endpoint
|
77 |
+
response = requests.post(
|
78 |
+
"https://your-space-url.hf.space/synthesize_base64",
|
79 |
+
json={"text": text, "sample_rate": 22050}
|
80 |
+
)
|
81 |
+
|
82 |
+
if response.status_code == 200:
|
83 |
+
result = response.json()
|
84 |
+
|
85 |
+
# Decode Base64 audio
|
86 |
+
audio_data = base64.b64decode(result["audio_base64"])
|
87 |
+
|
88 |
+
# Save as WAV file
|
89 |
+
with open("output.wav", "wb") as f:
|
90 |
+
f.write(audio_data)
|
91 |
+
|
92 |
+
print("Audio saved as output.wav")
|
93 |
+
```
|
94 |
+
|
95 |
+
### JavaScript Client
|
96 |
+
```javascript
|
97 |
+
async function synthesizeSpeech(text) {
|
98 |
+
const response = await fetch('/synthesize_base64', {
|
99 |
+
method: 'POST',
|
100 |
+
headers: {
|
101 |
+
'Content-Type': 'application/json',
|
102 |
+
},
|
103 |
+
body: JSON.stringify({
|
104 |
+
text: text,
|
105 |
+
sample_rate: 22050
|
106 |
+
})
|
107 |
+
});
|
108 |
+
|
109 |
+
if (response.ok) {
|
110 |
+
const result = await response.json();
|
111 |
+
|
112 |
+
// Create audio element
|
113 |
+
const audio = new Audio();
|
114 |
+
audio.src = `data:audio/wav;base64,${result.audio_base64}`;
|
115 |
+
audio.play();
|
116 |
+
}
|
117 |
+
}
|
118 |
+
|
119 |
+
// Usage
|
120 |
+
synthesizeSpeech("Hello from JavaScript!");
|
121 |
+
```
|
122 |
+
|
123 |
+
## Local Development
|
124 |
+
|
125 |
+
1. Install dependencies:
|
126 |
+
```bash
|
127 |
+
pip install -r requirements.txt
|
128 |
+
```
|
129 |
+
|
130 |
+
2. Run the server:
|
131 |
+
```bash
|
132 |
+
python app.py
|
133 |
+
```
|
134 |
+
|
135 |
+
3. The API will be available at `http://localhost:7860`
|
136 |
+
|
137 |
+
## Model Information
|
138 |
+
|
139 |
+
- **TTS Model:** SpeechBrain Tacotron2 (LJSpeech)
|
140 |
+
- **Vocoder:** SpeechBrain HiFiGAN (LJSpeech)
|
141 |
+
- **Default Sample Rate:** 22,050 Hz
|
142 |
+
- **Text Limit:** 500 characters per request
|
143 |
+
|
144 |
+
## License
|
145 |
+
|
146 |
+
MIT License
|
app.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException
|
2 |
+
from fastapi.responses import StreamingResponse
|
3 |
+
from pydantic import BaseModel
|
4 |
+
import torch
|
5 |
+
import torchaudio
|
6 |
+
from speechbrain.pretrained import Tacotron2
|
7 |
+
from speechbrain.pretrained import HIFIGAN
|
8 |
+
import io
|
9 |
+
import numpy as np
|
10 |
+
import tempfile
|
11 |
+
import os
|
12 |
+
|
13 |
+
# Application instance; title/description appear in the auto-generated OpenAPI docs.
app = FastAPI(title="SpeechBrain TTS API", description="Text-to-Speech API using SpeechBrain")
|
14 |
+
|
15 |
+
class TTSRequest(BaseModel):
    """Request body shared by the /synthesize and /synthesize_base64 endpoints."""

    # Text to synthesize; the endpoints reject empty text and text over 500 chars.
    text: str
    # Desired output sample rate in Hz; audio is resampled when this is not 22050.
    sample_rate: int = 22050
|
18 |
+
|
19 |
+
class TTSService:
    """Lazy-loading wrapper around SpeechBrain's Tacotron2 + HiFi-GAN pipeline."""

    def __init__(self):
        # Models are loaded on first use, not at construction time, so the
        # server can start before the (large) checkpoints are downloaded.
        self.tacotron2 = None
        self.hifi_gan = None
        # NOTE(review): the device is recorded but never applied — the models
        # are not moved to it, so inference runs wherever from_hparams puts
        # them. Confirm whether GPU placement was intended.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_models(self):
        """Load both pretrained models if they are not already in memory."""
        if self.tacotron2 is None:
            print("Loading Tacotron2 model...")
            self.tacotron2 = Tacotron2.from_hparams(
                source="speechbrain/tts-tacotron2-ljspeech",
                savedir="tmpdir_tts",
            )
        if self.hifi_gan is None:
            print("Loading HiFi-GAN vocoder...")
            self.hifi_gan = HIFIGAN.from_hparams(
                source="speechbrain/tts-hifigan-ljspeech",
                savedir="tmpdir_vocoder",
            )

    def synthesize(self, text: str, sample_rate: int = 22050):
        """Synthesize *text* and return ``(audio_array, sample_rate)``.

        The mel spectrogram from Tacotron2 is vocoded by HiFi-GAN; the
        resulting waveform is returned as a NumPy array on the CPU.
        """
        self.load_models()

        # encode_text also returns mel lengths and alignments, unused here.
        mel_spec, _mel_len, _alignment = self.tacotron2.encode_text(text)
        wave = self.hifi_gan.decode_batch(mel_spec)
        audio = wave.squeeze().cpu().numpy()

        if sample_rate != 22050:
            # Both LJSpeech models emit 22.05 kHz audio; resample only on request.
            import librosa
            audio = librosa.resample(audio, orig_sr=22050, target_sr=sample_rate)

        return audio, sample_rate
|
54 |
+
|
55 |
+
# Module-level singleton shared by all request handlers; models load lazily
# on the first synthesis request.
tts_service = TTSService()
|
56 |
+
|
57 |
+
@app.get("/")
|
58 |
+
async def root():
|
59 |
+
return {"message": "SpeechBrain TTS API is running!"}
|
60 |
+
|
61 |
+
@app.get("/health")
|
62 |
+
async def health_check():
|
63 |
+
return {"status": "healthy"}
|
64 |
+
|
65 |
+
@app.post("/synthesize")
|
66 |
+
async def synthesize_speech(request: TTSRequest):
|
67 |
+
try:
|
68 |
+
if not request.text or len(request.text.strip()) == 0:
|
69 |
+
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
70 |
+
|
71 |
+
if len(request.text) > 500:
|
72 |
+
raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
|
73 |
+
|
74 |
+
audio_data, sample_rate = tts_service.synthesize(request.text, request.sample_rate)
|
75 |
+
|
76 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
77 |
+
torchaudio.save(
|
78 |
+
tmp_file.name,
|
79 |
+
torch.tensor(audio_data).unsqueeze(0),
|
80 |
+
sample_rate
|
81 |
+
)
|
82 |
+
|
83 |
+
with open(tmp_file.name, "rb") as audio_file:
|
84 |
+
audio_bytes = audio_file.read()
|
85 |
+
|
86 |
+
os.unlink(tmp_file.name)
|
87 |
+
|
88 |
+
return StreamingResponse(
|
89 |
+
io.BytesIO(audio_bytes),
|
90 |
+
media_type="audio/wav",
|
91 |
+
headers={"Content-Disposition": "attachment; filename=synthesized_audio.wav"}
|
92 |
+
)
|
93 |
+
|
94 |
+
except Exception as e:
|
95 |
+
raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
|
96 |
+
|
97 |
+
@app.post("/synthesize_base64")
|
98 |
+
async def synthesize_speech_base64(request: TTSRequest):
|
99 |
+
import base64
|
100 |
+
|
101 |
+
try:
|
102 |
+
if not request.text or len(request.text.strip()) == 0:
|
103 |
+
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
104 |
+
|
105 |
+
if len(request.text) > 500:
|
106 |
+
raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
|
107 |
+
|
108 |
+
audio_data, sample_rate = tts_service.synthesize(request.text, request.sample_rate)
|
109 |
+
|
110 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
111 |
+
torchaudio.save(
|
112 |
+
tmp_file.name,
|
113 |
+
torch.tensor(audio_data).unsqueeze(0),
|
114 |
+
sample_rate
|
115 |
+
)
|
116 |
+
|
117 |
+
with open(tmp_file.name, "rb") as audio_file:
|
118 |
+
audio_bytes = audio_file.read()
|
119 |
+
|
120 |
+
os.unlink(tmp_file.name)
|
121 |
+
|
122 |
+
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
|
123 |
+
|
124 |
+
return {
|
125 |
+
"audio_base64": audio_base64,
|
126 |
+
"sample_rate": sample_rate,
|
127 |
+
"text": request.text
|
128 |
+
}
|
129 |
+
|
130 |
+
except Exception as e:
|
131 |
+
raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
|
132 |
+
|
133 |
+
# Local entry point: bind to all interfaces on port 7860, the port
# Hugging Face Spaces expects the app to serve on.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
client_test.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import base64
|
3 |
+
import json
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
class TTSClient:
    """Small HTTP client for the SpeechBrain TTS API.

    Every method returns a ``(success, payload_or_message)`` tuple instead of
    raising, so callers can report failures without try/except.
    """

    def __init__(self, base_url="http://localhost:7860", timeout=30.0):
        """Create a client.

        Args:
            base_url: Root URL of the API; a trailing slash is stripped.
            timeout: Per-request timeout in seconds. The original client set
                no timeout, so a stalled server would hang it forever.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def health_check(self):
        """Check API health; return (is_healthy, response_json_or_error)."""
        try:
            response = requests.get(f"{self.base_url}/health", timeout=self.timeout)
            return response.status_code == 200, response.json()
        except Exception as e:
            return False, str(e)

    def synthesize_to_file(self, text, output_path="output.wav", sample_rate=22050):
        """Synthesize *text* and save the returned WAV bytes to *output_path*.

        Returns (success, human-readable message).
        """
        try:
            response = requests.post(
                f"{self.base_url}/synthesize",
                json={"text": text, "sample_rate": sample_rate},
                timeout=self.timeout,
            )

            if response.status_code == 200:
                with open(output_path, "wb") as f:
                    f.write(response.content)
                return True, f"Audio saved to {output_path}"
            else:
                return False, f"Error: {response.status_code} - {response.text}"

        except Exception as e:
            return False, str(e)

    def synthesize_to_base64(self, text, sample_rate=22050):
        """Synthesize *text* via the Base64 endpoint.

        Returns (success, response_json_or_error_message).
        """
        try:
            response = requests.post(
                f"{self.base_url}/synthesize_base64",
                json={"text": text, "sample_rate": sample_rate},
                timeout=self.timeout,
            )

            if response.status_code == 200:
                return True, response.json()
            else:
                return False, f"Error: {response.status_code} - {response.text}"

        except Exception as e:
            return False, str(e)

    def save_base64_audio(self, audio_base64, output_path="output_from_base64.wav"):
        """Decode a Base64 audio string and write the bytes to *output_path*.

        Returns (success, human-readable message).
        """
        try:
            audio_data = base64.b64decode(audio_base64)
            with open(output_path, "wb") as f:
                f.write(audio_data)
            return True, f"Audio saved to {output_path}"
        except Exception as e:
            return False, str(e)
|
61 |
+
|
62 |
+
def main():
    """Exercise every endpoint of the TTS API end to end."""
    # Client for local development.
    client = TTSClient("http://localhost:7860")

    # For a Hugging Face Spaces deployment, point at the Space URL instead:
    # client = TTSClient("https://your-space-name.hf.space")

    print("=== SpeechBrain TTS API Client Test ===\n")

    # Step 1: make sure the server is reachable before running synthesis tests.
    print("1. Health Check:")
    is_healthy, health_result = client.health_check()
    print(f" Status: {'✓ Healthy' if is_healthy else '✗ Unhealthy'}")
    print(f" Response: {health_result}\n")

    if not is_healthy:
        print("API is not available. Please check if the server is running.")
        return

    # Sentences used by every synthesis test below.
    test_texts = [
        "Hello, this is a test of the SpeechBrain TTS API.",
        "The quick brown fox jumps over the lazy dog.",
        "Welcome to Hugging Face Spaces!"
    ]

    # Step 2: synthesis straight to WAV files on disk.
    print("2. Testing direct WAV file synthesis:")
    for index, text in enumerate(test_texts, start=1):
        print(f" Testing: '{text}'")
        success, result = client.synthesize_to_file(
            text,
            f"test_output_{index}.wav",
            sample_rate=22050
        )
        print(f" Result: {'✓ Success' if success else '✗ Failed'} - {result}")
        print()

    # Step 3: synthesis through the Base64 endpoint, then decode and save.
    print("3. Testing Base64 synthesis:")
    for index, text in enumerate(test_texts, start=1):
        print(f" Testing: '{text}'")
        success, result = client.synthesize_to_base64(text, sample_rate=22050)

        if success:
            print(f" ✓ Success - Audio length: {len(result['audio_base64'])} chars")

            save_success, save_result = client.save_base64_audio(
                result['audio_base64'],
                f"test_base64_{index}.wav"
            )
            print(f" Save result: {'✓ Success' if save_success else '✗ Failed'} - {save_result}")
        else:
            print(f" ✗ Failed - {result}")
        print()

    # Step 4: inputs the server is expected to reject.
    print("4. Testing error handling:")

    # Empty text must be rejected with a 400.
    print(" Testing empty text:")
    success, result = client.synthesize_to_file("", "empty_test.wav")
    print(f" Result: {'✓ Success' if success else '✗ Expected failure'} - {result}")

    # Over-long text must be rejected with a 400.
    print(" Testing too long text:")
    long_text = "This is a very long text. " * 50  # well past the 500-character limit
    success, result = client.synthesize_to_file(long_text, "long_test.wav")
    print(f" Result: {'✓ Success' if success else '✗ Expected failure'} - {result}")

    print("\n=== Test Complete ===")
    print("Check the generated audio files:")
    for index in range(1, len(test_texts) + 1):
        print(f" - test_output_{index}.wav")
        print(f" - test_base64_{index}.wav")
|
138 |
+
|
139 |
+
# Run the end-to-end API exercise when the script is invoked directly.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.104.1
|
2 |
+
uvicorn[standard]==0.24.0
|
3 |
+
speechbrain==0.5.16
|
4 |
+
torch>=1.13.0
|
5 |
+
torchaudio>=0.13.0
|
6 |
+
numpy>=1.21.0
|
7 |
+
scipy>=1.7.0
|
8 |
+
librosa>=0.9.0
|
9 |
+
soundfile>=0.12.1
|
10 |
+
python-multipart==0.0.6
|