# Spaces: Running
import gradio as gr | |
import torch | |
from transformers import AutoModelForTextToWaveform, AutoProcessor | |
# Checkpoint identifier used for both the processor and the model.
model_name = "hexgrad/Kokoro-82M"

# Choose the device BEFORE loading so the weight precision can match it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the model weights.
# Half precision is only used on CUDA: on CPU many float16 kernels are either
# unimplemented or very slow, so fall back to float32 there.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForTextToWaveform.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)

# Move the weights to the selected device.
model = model.to(device)
def text_to_audio(text, speed=1.0):
    """Convert text to audio using the Kokoro model.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is treated as "nothing to synthesize".
        speed: Rate factor handed to ``librosa.effects.time_stretch`` when it
            differs from 1.0. Must be greater than zero.

    Returns:
        A ``(sample_rate, waveform)`` tuple where ``waveform`` is a float32
        NumPy array, or ``None`` when there is no text to synthesize.

    Raises:
        ValueError: If ``speed`` is not positive.
    """
    # Nothing to synthesize: skip the model call instead of feeding it an
    # empty prompt.
    if not text or not text.strip():
        return None

    # Reject rates the time-stretch step cannot handle before doing any
    # expensive work.
    if speed <= 0:
        raise ValueError(f"speed must be positive, got {speed!r}")

    # Output sample rate reported alongside the waveform (fixed at 24 kHz).
    sample_rate = 24000

    # Tokenize / featurize the text and move every tensor to the model device.
    inputs = processor(text=text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Sampling settings forwarded to ``model.generate``.
    gen_kwargs = {
        "do_sample": True,
        "temperature": 0.7,
        "length_penalty": 1.0,
        "repetition_penalty": 2.0,
        "top_p": 0.9,
    }

    # Generate the waveform without tracking gradients.
    with torch.no_grad():
        generated = model.generate(**inputs, **gen_kwargs)

    # Always hand back float32: the model may run in half precision, and the
    # original code only up-cast on the speed-change path, so the default
    # path could return a float16 array. Take the first item of the batch.
    waveform = generated.float().cpu().numpy()[0]

    # Apply the speed factor only when it actually changes something.
    if speed != 1.0:
        # Imported lazily so librosa is only required when it is used.
        import librosa

        waveform = librosa.effects.time_stretch(waveform, rate=speed)

    return sample_rate, waveform
# Assemble the Gradio UI: a header, a row with an input column and an output
# column, the click wiring, and a short list of usage tips.
with gr.Blocks(title="Kokoro Text-to-Audio") as app:
    # Page heading and one-line description.
    gr.Markdown("# 🎵 Kokoro Text-to-Audio Converter")
    gr.Markdown("Convert text to speech using hexgrad/Kokoro-82M model")

    with gr.Row():
        # First column: text box, speed control, and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                lines=5,
                placeholder="Type something to convert to audio...",
                label="Enter your text",
            )
            speed_slider = gr.Slider(
                label="Speech Speed",
                value=1.0,
                minimum=0.5,
                maximum=1.5,
                step=0.1,
            )
            submit_btn = gr.Button("Generate Audio")

        # Second column: player for the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(type="numpy", label="Generated Audio")

    # Run text_to_audio on click, feeding it the text and the speed value and
    # routing its result to the audio player.
    submit_btn.click(
        outputs=[audio_output],
        inputs=[text_input, speed_slider],
        fn=text_to_audio,
    )

    # Usage notes shown below the controls.
    gr.Markdown("### Usage Tips")
    gr.Markdown("- For best results, keep your text reasonably short")
    gr.Markdown("- Adjust the speed slider to modify the pace of speech")
    gr.Markdown("- The model may take a moment to load on first use")

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()