fdaudens HF Staff committed on
Commit
a132885
·
verified ·
1 Parent(s): f50b82e

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +33 -12
  2. kokoro_text_to_audio.py +81 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,33 @@
1
- ---
2
- title: Kokoro Mcp
3
- emoji: 🐠
4
- colorFrom: yellow
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.28.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Kokoro Text-to-Audio App
2
+
3
+ A simple Gradio application that uses the hexgrad/Kokoro-82M model to convert text to audio.
4
+
5
+ ## Setup Instructions
6
+
7
+ 1. Install the required dependencies:
8
+ ```
9
+ pip install -r requirements.txt
10
+ ```
11
+
12
+ 2. Run the application:
13
+ ```
14
+ python kokoro_text_to_audio.py
15
+ ```
16
+
17
+ 3. Open your web browser and navigate to the URL displayed in the terminal (typically http://127.0.0.1:7860)
18
+
19
+ ## Features
20
+
21
+ - Simple text input box for entering the text you want to convert to audio
22
+ - Adjustable speech speed slider
23
+ - Audio playback directly in the browser
24
+
25
+ ## Requirements
26
+
27
+ - Python 3.8 or higher
28
+ - A GPU is recommended for faster generation, but it is not required
29
+ - Internet connection (to download the model on first run)
30
+
31
+ ## Model Information
32
+
33
+ This app uses the [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) model from Hugging Face.
kokoro_text_to_audio.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoModelForTextToWaveform, AutoProcessor
4
+
5
# Load the processor and model once at import time so every request reuses them.
# NOTE(review): confirm that this checkpoint actually ships transformers-compatible
# processor/model configs. If it is meant to be used through its own inference
# package instead, the Auto* loaders below will fail at startup.
model_name = "hexgrad/Kokoro-82M"

# Decide the device before loading so the dtype can follow it: half precision is
# only worthwhile on CUDA, while on CPU many float16 kernels are slow or missing,
# so the CPU path uses float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForTextToWaveform.from_pretrained(model_name, torch_dtype=model_dtype)
model = model.to(device)
13
+
14
def text_to_audio(text, speed=1.0):
    """Convert text to audio using the Kokoro model.

    Args:
        text: The text to synthesize. Must contain at least one
            non-whitespace character.
        speed: Time-stretch rate applied after generation. 1.0 leaves the
            audio untouched; the value is passed to librosa as ``rate``.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``sample_rate`` is an int
        in Hz and ``waveform`` is a float32 NumPy array.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Local imports keep the function self-contained; librosa is imported
    # lazily further down because it is only needed when the speed changes.
    import math

    import numpy as np

    # Reject blank input up front instead of sending it through the model.
    if text is None or not text.strip():
        raise gr.Error("Please enter some text to convert to audio.")

    # Tokenize the text and move every tensor to the model's device.
    inputs = processor(text=text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Sampling parameters forwarded unchanged to model.generate().
    gen_kwargs = {
        "do_sample": True,
        "temperature": 0.7,
        "length_penalty": 1.0,
        "repetition_penalty": 2.0,
        "top_p": 0.9,
    }

    # Inference only: no gradients needed. Take the first item of the batch.
    with torch.no_grad():
        waveform = model.generate(**inputs, **gen_kwargs).cpu().numpy()[0]

    # Always hand back float32 so the output dtype does not depend on the
    # model's precision or on whether the speed branch below ran.
    waveform = waveform.astype(np.float32)

    # Sample rate reported alongside the waveform.
    # NOTE(review): 24 kHz is assumed to match the model's output rate - confirm.
    sample_rate = 24000

    # Only time-stretch when the requested speed really differs from 1.0;
    # isclose() avoids a needless resynthesis on float noise such as 0.9999999.
    if not math.isclose(speed, 1.0):
        import librosa

        waveform = librosa.effects.time_stretch(waveform, rate=speed)

    return sample_rate, waveform
43
+
44
# Assemble the Gradio UI: input controls in the left column, the audio player
# in the right column, followed by a short list of usage tips.
with gr.Blocks(title="Kokoro Text-to-Audio") as app:
    gr.Markdown("# 🎵 Kokoro Text-to-Audio Converter")
    gr.Markdown("Convert text to speech using hexgrad/Kokoro-82M model")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                lines=5,
                label="Enter your text",
                placeholder="Type something to convert to audio...",
            )
            speed_slider = gr.Slider(
                label="Speech Speed",
                minimum=0.5,
                maximum=1.5,
                step=0.1,
                value=1.0,
            )
            submit_btn = gr.Button("Generate Audio")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Clicking the button runs text_to_audio on the text and the chosen speed
    # and sends the result to the audio player.
    submit_btn.click(
        fn=text_to_audio,
        inputs=[text_input, speed_slider],
        outputs=[audio_output],
    )

    # One Markdown component per line, created in the original order.
    for _usage_line in (
        "### Usage Tips",
        "- For best results, keep your text reasonably short",
        "- Adjust the speed slider to modify the pace of speech",
        "- The model may take a moment to load on first use",
    ):
        gr.Markdown(_usage_line)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=3.50.2
2
+ torch>=2.0.0
3
+ transformers>=4.34.0
4
+ librosa>=0.10.0
5
+ numpy>=1.22.0