Avinyaa committed · a7aae29 · Parent: 703fff1 · "new"

Files changed:
- Dockerfile: +18 -4
- README.md: +191 -149
- app.py: +324 -83
- client_example.py: +183 -45
- requirements.txt: +14 -4
- test.py: +135 -18
Dockerfile
CHANGED

```dockerfile
FROM python:3.11

RUN useradd -m -u 1000 user

# Install system dependencies as root
RUN apt-get update && apt-get install -y \
    git \
    git-lfs \
    espeak-ng \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Initialize git lfs
RUN git lfs install

USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    COQUI_TOS_AGREED=1 \
    NUMBA_DISABLE_JIT=1 \
    FORCE_CPU=true \
    CUDA_VISIBLE_DEVICES=""

# Set the working directory to the user's home directory
WORKDIR $HOME/app

RUN pip install --no-cache-dir --upgrade pip

COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download unidic for mecab (required for some TTS features)
RUN python -m unidic download

# Clone the C3PO XTTS model
RUN git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO XTTS-v2_C3PO

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Expose the port
EXPOSE 7860

# Start the API directly
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
```
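The image pins inference to the CPU through `FORCE_CPU=true` and `CUDA_VISIBLE_DEVICES=""`. If the host has a GPU and the NVIDIA container toolkit, one possible way to override those defaults at run time is sketched below; the flag values are assumptions, not part of the repository:

```bash
# Hypothetical GPU run: override the CPU-forcing defaults baked into the image
docker build -t xtts-c3po-api .
docker run --gpus all \
  -e FORCE_CPU=false \
  -e CUDA_VISIBLE_DEVICES=0 \
  -p 7860:7860 \
  xtts-c3po-api
```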
README.md
CHANGED

---
title: XTTS C3PO Voice Cloning API
emoji: 🤖
colorFrom: indigo
colorTo: yellow
sdk: docker
pinned: false
---

# XTTS C3PO Voice Cloning API

A FastAPI-based Text-to-Speech API using XTTS-v2 with the iconic C3PO voice from Star Wars.

## Features

- **C3PO Voice**: Pre-loaded with the iconic C3PO voice from Star Wars
- **Custom Voice Cloning**: Upload your own reference audio for voice cloning
- **Multilingual Support**: 16+ languages with C3PO voice
- **No Upload Required**: Use C3PO voice without any file uploads
- **RESTful API**: Clean API with automatic documentation
- **Docker Support**: Optimized for Hugging Face Spaces deployment
- **PyTorch 2.6 Compatible**: Includes compatibility fixes

## About the C3PO Model

This API uses the XTTS-v2 C3PO model from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO), which provides the iconic voice of C-3PO from Star Wars. The model supports:

- High-quality C3PO voice synthesis
- Multilingual C3PO speech (16+ languages)
- Custom voice cloning capabilities
- Real-time speech generation

## Quick Start

### Using C3PO Voice (No Upload Required)

```bash
curl -X POST "http://localhost:7860/tts-c3po" \
  -F "text=Hello there! I am C-3PO, human-cyborg relations." \
  -F "language=en" \
  --output c3po_speech.wav
```

### Using Custom Voice Cloning

```bash
curl -X POST "http://localhost:7860/tts" \
  -F "text=This will be spoken in your custom voice!" \
  -F "language=en" \
  -F "speaker_file=@your_reference_voice.wav" \
  --output custom_speech.wav
```

## API Endpoints

### C3PO Voice Only
- **POST** `/tts-c3po` - Generate speech using C3PO voice (no file upload needed)
- **Parameters:**
  - `text` (form): Text to convert to speech (max 500 characters)
  - `language` (form): Language code (default: "en")
  - `no_lang_auto_detect` (form): Disable automatic language detection

### Voice Cloning with Fallback
- **POST** `/tts` - Convert text to speech with optional custom voice
- **Parameters:**
  - `text` (form): Text to convert to speech (max 500 characters)
  - `language` (form): Language code (default: "en")
  - `voice_cleanup` (form): Apply audio cleanup to reference voice
  - `no_lang_auto_detect` (form): Disable automatic language detection
  - `speaker_file` (file, optional): Reference speaker audio file (uses C3PO if not provided)

### JSON API
- **POST** `/tts-json` - Convert text to speech using JSON request body
- **Body:** JSON object with `text`, `language`, `voice_cleanup`, `no_lang_auto_detect`
- **File:** `speaker_file` (optional) - Reference speaker audio file

### Information Endpoints
- **GET** `/health` - Check API status, device info, and supported languages
- **GET** `/languages` - Get list of supported languages
- **GET** `/docs` - Interactive API documentation (Swagger UI)

## Usage Examples

### Python - C3PO Voice

```python
import requests

# Generate C3PO speech
url = "http://localhost:7860/tts-c3po"
data = {
    "text": "Hello there! I am C-3PO, human-cyborg relations.",
    "language": "en"
}

response = requests.post(url, data=data)

if response.status_code == 200:
    with open("c3po_speech.wav", "wb") as f:
        f.write(response.content)
    print("C3PO speech generated!")
```

### Python - Custom Voice with C3PO Fallback

```python
import requests

url = "http://localhost:7860/tts"
data = {
    "text": "This will use C3PO voice if no speaker file is provided.",
    "language": "en"
}

# No speaker_file provided - will use C3PO voice
response = requests.post(url, data=data)

if response.status_code == 200:
    with open("speech_output.wav", "wb") as f:
        f.write(response.content)
```

### Multilingual C3PO

```python
# C3PO speaking Spanish
data = {
    "text": "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación.",
    "language": "es"
}
response = requests.post("http://localhost:7860/tts-c3po", data=data)
```

## Supported Languages

The C3PO model supports all XTTS-v2 languages:

- **en** - English
- **es** - Spanish
- **fr** - French
- **de** - German
- **it** - Italian
- **pt** - Portuguese (Brazilian)
- **pl** - Polish
- **tr** - Turkish
- **ru** - Russian
- **nl** - Dutch
- **cs** - Czech
- **ar** - Arabic
- **zh-cn** - Mandarin Chinese
- **ja** - Japanese
- **ko** - Korean
- **hu** - Hungarian
- **hi** - Hindi

## Setup

### Hugging Face Spaces Deployment

This API is optimized for Hugging Face Spaces with:
- Automatic C3PO model downloading
- Proper user permissions (user ID 1000)
- PyTorch 2.6 compatibility fixes
- COQUI license agreement handling

### Local Development

1. **Install system dependencies:**
```bash
# Ubuntu/Debian
sudo apt-get install espeak-ng ffmpeg git git-lfs

# macOS
brew install espeak ffmpeg git git-lfs
```

2. **Install Python dependencies:**
```bash
pip install -r requirements.txt
python -m unidic download
```

3. **Clone C3PO model (optional - auto-downloaded on first run):**
```bash
git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO XTTS-v2_C3PO
```

4. **Run the API:**
```bash
uvicorn app:app --host 0.0.0.0 --port 7860
```

### Using Docker

```bash
# Build and run
docker build -t xtts-c3po-api .
docker run -p 7860:7860 xtts-c3po-api
```

## Reference Audio Guidelines

For custom voice cloning:

1. **Duration**: 3-10 seconds of clear speech
2. **Quality**: High-quality audio, minimal background noise
3. **Format**: WAV format recommended (MP3, M4A also supported)
4. **Content**: Natural speech, avoid music or effects
5. **Speaker**: Single speaker, clear pronunciation
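One way to get a reference clip into this shape is with `ffmpeg` (already a system dependency above); the clip length and sample rate below are illustrative assumptions, not repository defaults:

```bash
# Convert a recording to a single-channel WAV and keep roughly the first 8 seconds
ffmpeg -i my_recording.m4a -ac 1 -ar 22050 -t 8 reference.wav
```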
211 |
## Model Information
|
212 |
|
213 |
+
- **Base Model**: XTTS-v2
|
214 |
+
- **Voice**: C3PO from Star Wars
|
215 |
+
- **Source**: [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO)
|
216 |
+
- **Languages**: 16+ supported
|
217 |
+
- **License**: CPML (Coqui Public Model License)
|
|
|
218 |
|
219 |
## Testing
|
220 |
|
221 |
+
Run the test suite:
|
222 |
```bash
|
223 |
+
# Test C3PO model functionality
|
224 |
python test.py
|
225 |
+
|
226 |
+
# Test API endpoints
|
227 |
+
python client_example.py
|
228 |
```
|
229 |
|
230 |
+
## Environment Variables
|
231 |
+
|
232 |
+
Automatically configured:
|
233 |
+
- `COQUI_TOS_AGREED=1` - Agrees to CPML license
|
234 |
+
- `NUMBA_DISABLE_JIT=1` - Disables Numba JIT compilation
|
235 |
+
|
236 |
+
## API Response Examples
|
237 |
+
|
238 |
+
### Health Check Response
|
239 |
+
```json
|
240 |
+
{
|
241 |
+
"status": "healthy",
|
242 |
+
"device": "cuda",
|
243 |
+
"model": "XTTS-v2 C3PO",
|
244 |
+
"default_voice": "C3PO",
|
245 |
+
"supported_languages": ["en", "es", "fr", ...]
|
246 |
+
}
|
247 |
```
|
248 |
|
249 |
+
### Languages Response
|
250 |
+
```json
|
251 |
+
{
|
252 |
+
"languages": ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi"]
|
253 |
+
}
|
254 |
```
|
255 |
|
256 |
+
## Troubleshooting
|
257 |
|
258 |
+
### PyTorch Loading Issues
|
259 |
+
The API includes fixes for PyTorch 2.6's `weights_only=True` default. If you encounter loading issues, ensure the compatibility fix is applied.
|
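The fix in question is applied at the top of `app.py` and `test.py`: the XTTS config class is registered as a safe global before the checkpoint is loaded, so `torch.load` accepts it under the stricter default.

```python
# Same compatibility shim used in app.py and test.py
import torch.serialization
from TTS.tts.configs.xtts_config import XttsConfig

torch.serialization.add_safe_globals([XttsConfig])
```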
### Model Download Issues
If the C3PO model fails to download:
1. Check internet connection
2. Verify git and git-lfs are installed
3. Manually clone: `git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO XTTS-v2_C3PO`

### Audio Quality Issues
- Use high-quality reference audio for custom voices
- Enable `voice_cleanup` for noisy reference audio
- Ensure reference audio is 3-10 seconds long

### Memory Issues
- Use CPU mode for lower memory usage: set `CUDA_VISIBLE_DEVICES=""`
- Reduce text length for batch processing
- Consider using GPU with sufficient VRAM (4GB+ recommended)

## License

This project uses XTTS-v2 which is licensed under the Coqui Public Model License (CPML). The C3PO model is provided by the community. See https://coqui.ai/cpml for license details.

## Credits

- **XTTS-v2**: Coqui AI
- **C3PO Model**: [Borcherding](https://huggingface.co/Borcherding)
- **Original Character**: C-3PO from Star Wars (Lucasfilm/Disney)
app.py
CHANGED

```python
# Import configuration first to setup environment
import app_config

import os
import sys
import io
import subprocess
import uuid
import time
import torch
import torchaudio
import tempfile
import logging
from typing import Optional

# Fix PyTorch weights_only issue for XTTS
import torch.serialization
from TTS.tts.configs.xtts_config import XttsConfig
torch.serialization.add_safe_globals([XttsConfig])

# Set environment variables
os.environ["COQUI_TOS_AGREED"] = "1"
os.environ["NUMBA_DISABLE_JIT"] = "1"

# Force CPU usage if specified
if os.environ.get("FORCE_CPU", "false").lower() == "true":
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from fastapi.responses import FileResponse
from pydantic import BaseModel
import langid
from scipy.io.wavfile import write
from pydub import AudioSegment

from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="XTTS C3PO API", description="Text-to-Speech API using XTTS-v2 C3PO model", version="1.0.0")

class TTSRequest(BaseModel):
    text: str
    language: str = "en"
    voice_cleanup: bool = False
    no_lang_auto_detect: bool = False

class XTTSService:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Use the C3PO model path
        self.model_path = "XTTS-v2_C3PO/"
        self.config_path = "XTTS-v2_C3PO/config.json"

        # Check if model files exist, if not download them
        if not os.path.exists(self.config_path):
            logger.info("C3PO model not found locally, downloading...")
            self._download_c3po_model()

        # Load configuration
        config = XttsConfig()
        config.load_json(self.config_path)

        # Initialize and load model
        self.model = Xtts.init_from_config(config)
        self.model.load_checkpoint(
            config,
            checkpoint_path=os.path.join(self.model_path, "model.pth"),
            vocab_path=os.path.join(self.model_path, "vocab.json"),
            eval=True,
        )

        if self.device == "cuda":
            self.model.cuda()

        self.supported_languages = config.languages
        logger.info(f"XTTS C3PO model loaded successfully. Supported languages: {self.supported_languages}")

        # Set default reference audio (C3PO voice)
        self.default_reference = os.path.join(self.model_path, "reference.wav")
        if not os.path.exists(self.default_reference):
            # Look for any reference audio in the model directory
            for file in os.listdir(self.model_path):
                if file.endswith(('.wav', '.mp3', '.m4a')):
                    self.default_reference = os.path.join(self.model_path, file)
                    break
            else:
                self.default_reference = None

        if self.default_reference:
            logger.info(f"Default C3PO reference audio: {self.default_reference}")
        else:
            logger.warning("No default reference audio found in C3PO model directory")

    def _download_c3po_model(self):
        """Download the C3PO model from Hugging Face"""
        try:
            logger.info("Downloading C3PO model from Hugging Face...")
            subprocess.run([
                "git", "clone",
                "https://huggingface.co/Borcherding/XTTS-v2_C3PO",
                "XTTS-v2_C3PO"
            ], check=True)
            logger.info("C3PO model downloaded successfully")
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to download C3PO model: {e}")
            raise HTTPException(status_code=500, detail="Failed to download C3PO model")

    def generate_speech(self, text: str, speaker_wav_path: str = None, language: str = "en",
                        voice_cleanup: bool = False, no_lang_auto_detect: bool = False) -> str:
        """Generate speech and return the path to the output file"""
        try:
            # Use default C3PO voice if no speaker file provided
            if speaker_wav_path is None:
                if self.default_reference is None:
                    raise HTTPException(status_code=400, detail="No reference audio available. Please upload a speaker file.")
                speaker_wav_path = self.default_reference
                logger.info("Using default C3PO voice")

            # Validate language
            if language not in self.supported_languages:
                raise HTTPException(status_code=400, detail=f"Language '{language}' not supported. Supported: {self.supported_languages}")

            # Language detection for longer texts
            if len(text) > 15 and not no_lang_auto_detect:
                language_predicted = langid.classify(text)[0].strip()
                if language_predicted == "zh":
                    language_predicted = "zh-cn"

                if language_predicted != language:
                    logger.warning(f"Detected language: {language_predicted}, chosen: {language}")

            # Text length validation
            if len(text) < 2:
                raise HTTPException(status_code=400, detail="Text too short, please provide longer text")

            if len(text) > 500:  # Increased limit for API
                raise HTTPException(status_code=400, detail="Text too long, maximum 500 characters")

            # Voice cleanup if requested
            processed_speaker_wav = speaker_wav_path
            if voice_cleanup:
                processed_speaker_wav = self._cleanup_audio(speaker_wav_path)

            # Generate conditioning latents
            try:
                gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
                    audio_path=processed_speaker_wav,
                    gpt_cond_len=30,
                    gpt_cond_chunk_len=4,
                    max_ref_length=60
                )
            except Exception as e:
                logger.error(f"Speaker encoding error: {e}")
                raise HTTPException(status_code=400, detail="Error processing reference audio. Please check the audio file.")

            # Generate speech
            logger.info("Generating speech...")
            start_time = time.time()

            out = self.model.inference(
                text,
                language,
                gpt_cond_latent,
                speaker_embedding,
                repetition_penalty=5.0,
                temperature=0.75,
            )

            inference_time = time.time() - start_time
            logger.info(f"Speech generation completed in {inference_time:.2f} seconds")

            # Save output
            output_filename = f"xtts_c3po_output_{uuid.uuid4().hex}.wav"
            output_path = os.path.join(tempfile.gettempdir(), output_filename)

            torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)

            return output_path

        except Exception as e:
            logger.error(f"Error generating speech: {e}")
            if isinstance(e, HTTPException):
                raise e
            raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")

    def _cleanup_audio(self, audio_path: str) -> str:
        """Apply audio cleanup filters"""
        try:
            output_path = audio_path + "_cleaned.wav"

            # Basic audio cleanup using ffmpeg-python or similar
            # For now, just return the original path
            # You can implement more sophisticated cleanup here

            return audio_path
        except Exception as e:
            logger.warning(f"Audio cleanup failed: {e}, using original audio")
            return audio_path

# Initialize XTTS service
logger.info("Initializing XTTS C3PO service...")
tts_service = XTTSService()

@app.get("/")
async def root():
    return {"message": "XTTS C3PO API is running", "status": "healthy", "model": "C3PO"}

@app.get("/health")
async def health_check():
    return {
        "status": "healthy",
        "device": tts_service.device,
        "model": "XTTS-v2 C3PO",
        "supported_languages": tts_service.supported_languages,
        "default_voice": "C3PO" if tts_service.default_reference else "None"
    }

@app.get("/languages")
async def get_languages():
    """Get list of supported languages"""
    return {"languages": tts_service.supported_languages}

@app.post("/tts")
async def text_to_speech(
    text: str = Form(...),
    language: str = Form("en"),
    voice_cleanup: bool = Form(False),
    no_lang_auto_detect: bool = Form(False),
    speaker_file: UploadFile = File(None)
):
    """
    Convert text to speech using XTTS C3PO voice cloning

    - **text**: The text to convert to speech (max 500 characters)
    - **language**: Language code (default: "en")
    - **voice_cleanup**: Apply audio cleanup to reference voice
    - **no_lang_auto_detect**: Disable automatic language detection
    - **speaker_file**: Reference speaker audio file (optional, uses C3PO voice if not provided)
    """

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    speaker_temp_path = None

    try:
        # Handle speaker file if provided
        if speaker_file is not None:
            # Validate file type
            if not speaker_file.content_type.startswith('audio/'):
                raise HTTPException(status_code=400, detail="Speaker file must be an audio file")

            # Save uploaded speaker file temporarily
            speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")

            with open(speaker_temp_path, "wb") as buffer:
                content = await speaker_file.read()
                buffer.write(content)

        # Generate speech (will use C3PO voice if no speaker file provided)
        output_path = tts_service.generate_speech(
            text,
            speaker_temp_path,
            language,
            voice_cleanup,
            no_lang_auto_detect
        )

        # Clean up temporary speaker file
        if speaker_temp_path and os.path.exists(speaker_temp_path):
            try:
                os.remove(speaker_temp_path)
            except:
                pass

        # Return the generated audio file
        voice_type = "custom" if speaker_file else "c3po"
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"xtts_{voice_type}_output_{uuid.uuid4().hex}.wav",
            headers={"Content-Disposition": "attachment"}
        )

    except Exception as e:
        # Clean up files in case of error
        if speaker_temp_path and os.path.exists(speaker_temp_path):
            try:
                os.remove(speaker_temp_path)
            except:
                pass

        logger.error(f"Error in TTS endpoint: {e}")
        if isinstance(e, HTTPException):
            raise e
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/tts-json")
async def text_to_speech_json(
    request: TTSRequest,
    speaker_file: UploadFile = File(None)
):
    """
    Convert text to speech using JSON request body

    - **request**: TTSRequest containing text, language, and options
    - **speaker_file**: Reference speaker audio file (optional, uses C3PO voice if not provided)
    """

    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    speaker_temp_path = None

    try:
        # Handle speaker file if provided
        if speaker_file is not None:
            # Validate file type
            if not speaker_file.content_type.startswith('audio/'):
                raise HTTPException(status_code=400, detail="Speaker file must be an audio file")

            # Save uploaded speaker file temporarily
            speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")

            with open(speaker_temp_path, "wb") as buffer:
                content = await speaker_file.read()
                buffer.write(content)

        # Generate speech
        output_path = tts_service.generate_speech(
            request.text,
            speaker_temp_path,
            request.language,
            request.voice_cleanup,
            request.no_lang_auto_detect
        )

        # Clean up temporary speaker file
        if speaker_temp_path and os.path.exists(speaker_temp_path):
            try:
                os.remove(speaker_temp_path)
            except:
                pass

        # Return the generated audio file
        voice_type = "custom" if speaker_file else "c3po"
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"xtts_{voice_type}_{request.language}_{uuid.uuid4().hex}.wav",
            headers={"Content-Disposition": "attachment"}
        )

    except Exception as e:
        # Clean up files in case of error
        if speaker_temp_path and os.path.exists(speaker_temp_path):
            try:
                os.remove(speaker_temp_path)
            except:
                pass

        logger.error(f"Error in TTS JSON endpoint: {e}")
        if isinstance(e, HTTPException):
            raise e
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/tts-c3po")
async def text_to_speech_c3po_only(
    text: str = Form(...),
    language: str = Form("en"),
    no_lang_auto_detect: bool = Form(False)
):
    """
    Convert text to speech using C3PO voice only (no file upload needed)

    - **text**: The text to convert to speech (max 500 characters)
    - **language**: Language code (default: "en")
    - **no_lang_auto_detect**: Disable automatic language detection
    """

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    try:
        # Generate speech using C3PO voice
        output_path = tts_service.generate_speech(
            text,
            None,  # Use default C3PO voice
            language,
            False,  # No voice cleanup needed for default voice
            no_lang_auto_detect
        )

        # Return the generated audio file
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"c3po_voice_{uuid.uuid4().hex}.wav",
            headers={"Content-Disposition": "attachment"}
        )

    except Exception as e:
        logger.error(f"Error in C3PO TTS endpoint: {e}")
        if isinstance(e, HTTPException):
            raise e
        raise HTTPException(status_code=500, detail=str(e))
```
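The `_cleanup_audio` helper above is currently a stub that returns the original path. A minimal sketch of what it could do with `pydub` (already listed in requirements.txt) is shown below; the silence threshold and gain target are illustrative assumptions, not values from the repository:

```python
from pydub import AudioSegment
from pydub.silence import detect_leading_silence


def cleanup_audio_sketch(audio_path: str) -> str:
    """Trim leading/trailing silence and normalize a reference clip (illustrative only)."""
    audio = AudioSegment.from_file(audio_path)

    # Trim audio quieter than -40 dBFS from both ends (assumed threshold)
    start_trim = detect_leading_silence(audio, silence_threshold=-40.0)
    end_trim = detect_leading_silence(audio.reverse(), silence_threshold=-40.0)
    trimmed = audio[start_trim:len(audio) - end_trim]

    # Nudge the peak level toward -3 dBFS (assumed target)
    trimmed = trimmed.apply_gain(-3.0 - trimmed.max_dBFS)

    output_path = audio_path + "_cleaned.wav"
    trimmed.export(output_path, format="wav")
    return output_path
```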
client_example.py
CHANGED

```python
import requests
import os

def test_c3po_voice():
    """Test the C3PO voice without uploading any files"""

    # API endpoint for C3PO voice only
    url = "http://localhost:7860/tts-c3po"

    # Text to convert to speech
    text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"

    # Prepare the request data
    data = {
        "text": text,
        "language": "en",
        "no_lang_auto_detect": False
    }

    try:
        print("Testing C3PO voice...")
        print(f"Text: {text}")

        response = requests.post(url, data=data)

        if response.status_code == 200:
            # Save the generated audio
            output_filename = "c3po_voice_sample.wav"
            with open(output_filename, "wb") as f:
                f.write(response.content)
            print(f"Success! C3PO voice sample saved as {output_filename}")
        else:
            print(f"Error: {response.status_code}")
            print(response.text)
    # ... (unchanged lines not shown in the diff)
    except Exception as e:
        print(f"Error: {e}")

def test_xtts_with_custom_voice():
    """Example of using XTTS with custom voice upload"""

    # API endpoint
    url = "http://localhost:7860/tts"

    # Text to convert to speech
    text = "This is a test of XTTS voice cloning with a custom reference voice."

    # Path to your speaker reference audio file
    speaker_file_path = "reference.wav"  # Update this path to your reference audio

    # Check if speaker file exists
    if not os.path.exists(speaker_file_path):
        print(f"Custom voice test skipped: Speaker file not found at {speaker_file_path}")
        print("To test custom voice cloning:")
        print("1. Record 3-10 seconds of clear speech")
        print("2. Save as 'reference.wav' in this directory")
        print("3. Run this test again")
        return

    # Prepare the request data
    data = {
        "text": text,
        "language": "en",
        "voice_cleanup": False,
        "no_lang_auto_detect": False
    }

    files = {
        "speaker_file": open(speaker_file_path, "rb")
    }

    try:
        print("Testing XTTS with custom voice...")
        print(f"Text: {text}")
        print(f"Speaker file: {speaker_file_path}")

        response = requests.post(url, data=data, files=files)

        if response.status_code == 200:
            # Save the generated audio
            output_filename = "custom_voice_clone.wav"
            with open(output_filename, "wb") as f:
                f.write(response.content)
            print(f"Success! Custom voice clone saved as {output_filename}")
        else:
            print(f"Error: {response.status_code}")
            print(response.text)

    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        files["speaker_file"].close()

def test_xtts_fallback_to_c3po():
    """Test XTTS endpoint without speaker file (should use C3PO voice)"""

    # API endpoint
    url = "http://localhost:7860/tts"

    # Text to convert to speech
    text = "When no custom voice is provided, I will speak in the C3PO voice by default."

    # Prepare the request data (no speaker file)
    data = {
        "text": text,
        "language": "en",
        "voice_cleanup": False,
        "no_lang_auto_detect": False
    }

    try:
        print("Testing XTTS fallback to C3PO voice...")
        print(f"Text: {text}")

        response = requests.post(url, data=data)

        if response.status_code == 200:
            # Save the generated audio
            output_filename = "xtts_c3po_fallback.wav"
            with open(output_filename, "wb") as f:
                f.write(response.content)
            print(f"Success! XTTS with C3PO fallback saved as {output_filename}")
        else:
            print(f"Error: {response.status_code}")
            print(response.text)
    # ... (unchanged lines not shown in the diff)
    except Exception as e:
        print(f"Error: {e}")

def test_multilingual_c3po():
    """Test C3PO voice in different languages"""

    # API endpoint for C3PO voice only
    url = "http://localhost:7860/tts-c3po"

    # Test different languages
    test_cases = [
        ("en", "Hello, I am C-3PO. I am fluent in over six million forms of communication."),
        ("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
        ("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
        ("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
    ]

    for language, text in test_cases:
        data = {
            "text": text,
            "language": language,
            "no_lang_auto_detect": True  # Force the specified language
        }

        try:
            print(f"Testing C3PO voice in {language.upper()}...")
            print(f"Text: {text}")

            response = requests.post(url, data=data)

            if response.status_code == 200:
                # Save the generated audio
                output_filename = f"c3po_voice_{language}.wav"
                with open(output_filename, "wb") as f:
                    f.write(response.content)
                print(f"Success! C3PO {language} voice saved as {output_filename}")
            else:
                print(f"Error: {response.status_code}")
                print(response.text)

        except requests.exceptions.ConnectionError:
            print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
        except Exception as e:
            print(f"Error: {e}")

        print()  # Add spacing between tests

def get_supported_languages():
    """Get list of supported languages"""
    try:
        response = requests.get("http://localhost:7860/languages")
        if response.status_code == 200:
            languages = response.json()
            print("Supported languages:", languages["languages"])
            return languages["languages"]
        else:
            print("Failed to get languages:", response.status_code)
            return []
    except requests.exceptions.ConnectionError:
        print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
    # ... (unchanged lines not shown in the diff)

def check_api_health():
    # ... (unchanged lines not shown in the diff)
    try:
        response = requests.get("http://localhost:7860/health")
        if response.status_code == 200:
            health_info = response.json()
            print("API Health Check:")
            print(f"  Status: {health_info['status']}")
            print(f"  Device: {health_info['device']}")
            print(f"  Model: {health_info['model']}")
            print(f"  Default Voice: {health_info['default_voice']}")
            print(f"  Languages: {len(health_info['supported_languages'])} supported")
            return True
        else:
            print("API health check failed:", response.status_code)
    # ... (unchanged lines not shown in the diff)
        print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
        return False

def create_sample_reference():
    """Instructions for creating a reference audio file"""
    print("\n" + "="*50)
    print("REFERENCE AUDIO SETUP")
    print("="*50)
    print("To use XTTS voice cloning, you need a reference audio file:")
    print("1. Record 3-10 seconds of clear speech")
    print("2. Save as WAV format (recommended)")
    print("3. Ensure good audio quality (no background noise)")
    print("4. Place the file in the same directory as this script")
    print("5. Update the 'speaker_file_path' variable in the functions above")
    print("\nExample recording text:")
    print("'Hello, this is my voice. I'm recording this sample for voice cloning.'")
    print("="*50)

if __name__ == "__main__":
    print("XTTS C3PO API Client Example")
    print("=" * 40)

    # First check if API is running
    if check_api_health():
        print()

        # Get supported languages
        languages = get_supported_languages()
        print()

        # Test C3PO voice (no file upload needed)
        print("1. Testing C3PO voice (no upload required)...")
        test_c3po_voice()
        print()

        # Test XTTS fallback to C3PO
        print("2. Testing XTTS endpoint without speaker file (C3PO fallback)...")
        test_xtts_fallback_to_c3po()
        print()

        # Test custom voice if reference file exists
        print("3. Testing custom voice cloning...")
        test_xtts_with_custom_voice()
        print()

        # Test multilingual C3PO
        print("4. Testing multilingual C3PO voice...")
        test_multilingual_c3po()

        print("All tests completed!")
        print("\nGenerated files:")
        for file in os.listdir("."):
            if file.endswith(".wav") and ("c3po" in file or "custom" in file or "xtts" in file):
                print(f"  - {file}")

    else:
        print("\nPlease start the API server first:")
        print("uvicorn app:app --host 0.0.0.0 --port 7860")
```
requirements.txt
CHANGED

```
TTS @ git+https://github.com/coqui-ai/TTS@v0.21.1
pydantic==1.10.13
python-multipart==0.0.6
typing-extensions>=4.8.0
cutlet
mecab-python3==1.0.6
unidic-lite==1.0.8
unidic==1.1.0
langid
pydub
fastapi
uvicorn[standard]
torch
torchaudio
soundfile
scipy
numpy
```
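A quick, optional way to confirm the pinned stack installed cleanly before starting the server; the one-line import check is a suggestion, not part of the repository:

```bash
pip install -r requirements.txt
python -m unidic download
# Sanity-check that the TTS and torch imports used by app.py resolve
python -c "from TTS.api import TTS; import torch, torchaudio; print('TTS stack imports OK')"
```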
test.py
CHANGED

```python
import os
import torch
import torchaudio
import subprocess

# Fix PyTorch weights_only issue for XTTS
import torch.serialization
from TTS.tts.configs.xtts_config import XttsConfig
torch.serialization.add_safe_globals([XttsConfig])

# Set environment variables
os.environ['COQUI_TOS_AGREED'] = '1'
os.environ['NUMBA_DISABLE_JIT'] = '1'

from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

print("Testing XTTS C3PO voice cloning...")

# C3PO model path
model_path = "XTTS-v2_C3PO/"
config_path = "XTTS-v2_C3PO/config.json"

# Check if model files exist, if not download them
if not os.path.exists(config_path):
    print("C3PO model not found locally, downloading...")
    try:
        subprocess.run([
            "git", "clone",
            "https://huggingface.co/Borcherding/XTTS-v2_C3PO",
            "XTTS-v2_C3PO"
        ], check=True)
        print("C3PO model downloaded successfully")
    except subprocess.CalledProcessError as e:
        print(f"Failed to download C3PO model: {e}")
        exit(1)

# Load configuration
config = XttsConfig()
config.load_json(config_path)

# Initialize and load model
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    model.cuda()

print(f"C3PO model loaded on {device}")

# Text to convert to speech
text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"

# Look for reference audio in the C3PO model directory
reference_audio_path = None
for file in os.listdir(model_path):
    if file.endswith(('.wav', '.mp3', '.m4a')):
        reference_audio_path = os.path.join(model_path, file)
        print(f"Found C3PO reference audio: {file}")
        break

# If no reference audio found, create a simple test reference
if reference_audio_path is None:
    print("No reference audio found in C3PO model, creating test reference...")
    reference_audio_path = "test_reference.wav"

    # Generate a simple sine wave as placeholder
    import numpy as np
    sample_rate = 24000
    duration = 3  # seconds
    frequency = 440  # Hz
    t = np.linspace(0, duration, int(sample_rate * duration))
    audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)

    # Save as WAV
    torchaudio.save(reference_audio_path, torch.tensor(audio_data).unsqueeze(0), sample_rate)
    print(f"Test reference audio created: {reference_audio_path}")

try:
    # Generate conditioning latents
    print("Processing reference audio...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_audio_path,
        gpt_cond_len=30,
        gpt_cond_chunk_len=4,
        max_ref_length=60
    )

    # Generate speech
    print("Generating C3PO speech...")
    out = model.inference(
        text,
        "en",  # language
        gpt_cond_latent,
        speaker_embedding,
        repetition_penalty=5.0,
        temperature=0.75,
    )

    # Save output
    output_path = "c3po_test_output.wav"
    torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
    print(f"C3PO speech generated successfully! Saved as: {output_path}")

    # Test multilingual capabilities
    print("\nTesting multilingual C3PO...")
    multilingual_tests = [
        ("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
        ("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
        ("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
    ]

    for lang, test_text in multilingual_tests:
        print(f"Generating {lang.upper()} speech...")
        out = model.inference(
            test_text,
            lang,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )

        output_path = f"c3po_test_{lang}.wav"
        torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
        print(f"C3PO {lang.upper()} speech saved as: {output_path}")

except Exception as e:
    print(f"Error during speech generation: {e}")
    import traceback
    traceback.print_exc()

print("XTTS C3PO test completed!")
print("\nGenerated files:")
for file in os.listdir("."):
    if file.startswith("c3po_test") and file.endswith(".wav"):
        print(f"  - {file}")
```