Spaces:
Running
Running
Upload 3 files
Browse files- README.md +33 -12
- kokoro_text_to_audio.py +81 -0
- requirements.txt +5 -0
README.md
CHANGED
@@ -1,12 +1,33 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Kokoro Text-to-Audio App
|
2 |
+
|
3 |
+
A simple Gradio application that uses the hexgrad/Kokoro-82M model to convert text to audio.
|
4 |
+
|
5 |
+
## Setup Instructions
|
6 |
+
|
7 |
+
1. Install the required dependencies:
|
8 |
+
```
|
9 |
+
pip install -r requirements.txt
|
10 |
+
```
|
11 |
+
|
12 |
+
2. Run the application:
|
13 |
+
```
|
14 |
+
python kokoro_text_to_audio.py
|
15 |
+
```
|
16 |
+
|
17 |
+
3. Open your web browser and navigate to the URL displayed in the terminal (typically http://127.0.0.1:7860).
|
18 |
+
|
19 |
+
## Features
|
20 |
+
|
21 |
+
- Simple text input box for entering the text you want to convert to audio
|
22 |
+
- Adjustable speech speed slider
|
23 |
+
- Audio playback directly in the browser
|
24 |
+
|
25 |
+
## Requirements
|
26 |
+
|
27 |
+
- Python 3.8 or higher
|
28 |
+
- A GPU is recommended for faster generation but is not required
|
29 |
+
- Internet connection (to download the model on first run)
|
30 |
+
|
31 |
+
## Model Information
|
32 |
+
|
33 |
+
This app uses the [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) model from Hugging Face.
|
kokoro_text_to_audio.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from transformers import AutoModelForTextToWaveform, AutoProcessor
|
4 |
+
|
5 |
+
# Load model and processor.
# NOTE(review): confirm that this checkpoint can be loaded through the
# transformers auto classes; its model card appears to document inference via
# the separate `kokoro` package instead -- verify before relying on this path.
model_name = "hexgrad/Kokoro-82M"

# Resolve the device before loading so the dtype can follow it: half precision
# is only requested on CUDA, because float16 inference on CPU is slow and some
# operators are not implemented for it.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForTextToWaveform.from_pretrained(model_name, torch_dtype=torch_dtype)

# Move to GPU if available
model = model.to(device)
|
13 |
+
|
14 |
+
def text_to_audio(text, speed=1.0):
    """Convert text to audio using the Kokoro model.

    Args:
        text: Text to synthesize. Must contain at least one non-whitespace
            character.
        speed: Rate multiplier applied to the generated audio with a
            time-stretch; 1.0 leaves the audio untouched.

    Returns:
        A ``(sample_rate, waveform)`` tuple where ``waveform`` is a float32
        NumPy array, matching the ``type="numpy"`` audio output component.

    Raises:
        gr.Error: If ``text`` is empty/whitespace or ``speed`` is not positive.
    """
    import numpy as np

    # Reject unusable input up front instead of sending it to the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to audio.")
    if speed <= 0:
        raise gr.Error("Speech speed must be greater than zero.")

    # Process the input text and move every tensor to the model's device.
    inputs = processor(text=text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Set generation parameters
    gen_kwargs = {
        "do_sample": True,
        "temperature": 0.7,
        "length_penalty": 1.0,
        "repetition_penalty": 2.0,
        "top_p": 0.9,
    }

    # Generate waveform; gradients are not needed for inference.
    with torch.no_grad():
        generated = model.generate(**inputs, **gen_kwargs)

    # Take the first item of the output (assumed to be the batch dimension --
    # TODO confirm) and normalise to float32 so the result has the same dtype
    # whether or not the model ran in half precision.
    waveform = generated.cpu().numpy()[0].astype(np.float32)

    # Prefer the sampling rate declared by the model config when there is one;
    # otherwise fall back to the previously hard-coded 24000 Hz.
    config = getattr(model, "config", None)
    sample_rate = getattr(config, "sampling_rate", None) or 24000

    # Apply speed factor if needed (tolerant comparison avoids a pointless
    # stretch when the slider value is off from 1.0 by float noise).
    if abs(speed - 1.0) > 1e-6:
        import librosa  # imported lazily: only required for non-default speeds

        waveform = librosa.effects.time_stretch(waveform, rate=speed)

    return sample_rate, waveform
|
43 |
+
|
44 |
+
# Build the Gradio UI: a header, a two-column row (inputs | audio player),
# the click wiring, and a short list of usage tips.
with gr.Blocks(title="Kokoro Text-to-Audio") as app:
    gr.Markdown("# 🎵 Kokoro Text-to-Audio Converter")
    gr.Markdown("Convert text to speech using hexgrad/Kokoro-82M model")

    with gr.Row():
        # Left column: text box, speed control, and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type something to convert to audio...",
                lines=5,
            )
            speed_slider = gr.Slider(
                minimum=0.5, maximum=1.5, value=1.0, step=0.1, label="Speech Speed"
            )
            submit_btn = gr.Button("Generate Audio")

        # Right column: player for the generated (sample_rate, array) pair.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Run text_to_audio on click, feeding it the text and the slider value.
    submit_btn.click(
        fn=text_to_audio, inputs=[text_input, speed_slider], outputs=[audio_output]
    )

    # Each tip is rendered as its own Markdown component, in this order.
    for tip in (
        "### Usage Tips",
        "- For best results, keep your text reasonably short",
        "- Adjust the speed slider to modify the pace of speech",
        "- The model may take a moment to load on first use",
    ):
        gr.Markdown(tip)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    app.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=3.50.2
|
2 |
+
torch>=2.0.0
|
3 |
+
transformers>=4.34.0
|
4 |
+
librosa>=0.10.0
|
5 |
+
numpy>=1.22.0
|