fdaudens HF Staff committed on
Commit
11bfd4b
·
verified ·
1 Parent(s): ed91a1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -35
app.py CHANGED
@@ -1,50 +1,49 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoModelForTextToWaveform, AutoProcessor
 
4
 
5
- # Load model and processor
6
- model_name = "hexgrad/Kokoro-82M"
7
- processor = AutoProcessor.from_pretrained(model_name)
8
- model = AutoModelForTextToWaveform.from_pretrained(model_name, torch_dtype=torch.float16)
9
 
10
- # Move to GPU if available
11
- device = "cuda" if torch.cuda.is_available() else "cpu"
12
- model = model.to(device)
 
 
 
 
 
13
 
14
  def text_to_audio(text, speed=1.0):
15
  """Convert text to audio using Kokoro model"""
16
- # Process the input text
17
- inputs = processor(text=text, return_tensors="pt")
18
- inputs = {k: v.to(device) for k, v in inputs.items()}
19
-
20
- # Set generation parameters
21
- gen_kwargs = {
22
- "do_sample": True,
23
- "temperature": 0.7,
24
- "length_penalty": 1.0,
25
- "repetition_penalty": 2.0,
26
- "top_p": 0.9,
27
- }
28
 
29
- # Generate waveform
30
- with torch.no_grad():
31
- waveform = model.generate(**inputs, **gen_kwargs).cpu().numpy()[0]
32
 
33
- # Create a sample rate (typical for audio is 24000)
34
- sample_rate = 24000
35
 
36
- # Apply speed factor if needed
37
- if speed != 1.0:
38
- import numpy as np
39
- import librosa
40
- waveform = librosa.effects.time_stretch(waveform.astype(np.float32), rate=speed)
 
 
 
 
 
 
41
 
42
- return sample_rate, waveform
43
 
44
  # Create Gradio interface
45
  with gr.Blocks(title="Kokoro Text-to-Audio") as app:
46
  gr.Markdown("# 🎵 Kokoro Text-to-Audio Converter")
47
- gr.Markdown("Convert text to speech using hexgrad/Kokoro-82M model")
48
 
49
  with gr.Row():
50
  with gr.Column():
@@ -55,7 +54,7 @@ with gr.Blocks(title="Kokoro Text-to-Audio") as app:
55
  )
56
  speed_slider = gr.Slider(
57
  minimum=0.5,
58
- maximum=1.5,
59
  value=1.0,
60
  step=0.1,
61
  label="Speech Speed"
@@ -72,10 +71,10 @@ with gr.Blocks(title="Kokoro Text-to-Audio") as app:
72
  )
73
 
74
  gr.Markdown("### Usage Tips")
75
- gr.Markdown("- For best results, keep your text reasonably short")
76
  gr.Markdown("- Adjust the speed slider to modify the pace of speech")
77
  gr.Markdown("- The model may take a moment to load on first use")
78
 
79
  # Launch the app
80
  if __name__ == "__main__":
81
- app.launch()
 
1
  import gradio as gr
2
  import torch
3
+ import numpy as np
4
+ from kokoro import KModel, KPipeline
5
 
6
+ # Check if CUDA is available
7
+ CUDA_AVAILABLE = torch.cuda.is_available()
 
 
8
 
9
+ # Initialize the model
10
+ model = KModel().to('cuda' if CUDA_AVAILABLE else 'cpu').eval()
11
+
12
+ # Initialize pipelines for different language codes (using 'a' for English)
13
+ pipelines = {'a': KPipeline(lang_code='a', model=False)}
14
+
15
+ # Custom pronunciation for "kokoro"
16
+ pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
17
 
18
  def text_to_audio(text, speed=1.0):
19
  """Convert text to audio using Kokoro model"""
20
+ if not text:
21
+ return None
 
 
 
 
 
 
 
 
 
 
22
 
23
+ pipeline = pipelines['a'] # Use English pipeline
24
+ voice = "af_heart" # Default voice (US English, female, Heart)
 
25
 
26
+ # Process the text
27
+ pack = pipeline.load_voice(voice)
28
 
29
+ for _, ps, _ in pipeline(text, voice, speed):
30
+ ref_s = pack[len(ps)-1]
31
+
32
+ # Generate audio
33
+ try:
34
+ audio = model(ps, ref_s, speed)
35
+ except Exception as e:
36
+ raise gr.Error(f"Error generating audio: {str(e)}")
37
+
38
+ # Return the audio with 24kHz sample rate
39
+ return 24000, audio.numpy()
40
 
41
+ return None
42
 
43
  # Create Gradio interface
44
  with gr.Blocks(title="Kokoro Text-to-Audio") as app:
45
  gr.Markdown("# 🎵 Kokoro Text-to-Audio Converter")
46
+ gr.Markdown("Convert text to speech using the Kokoro-82M model")
47
 
48
  with gr.Row():
49
  with gr.Column():
 
54
  )
55
  speed_slider = gr.Slider(
56
  minimum=0.5,
57
+ maximum=2.0,
58
  value=1.0,
59
  step=0.1,
60
  label="Speech Speed"
 
71
  )
72
 
73
  gr.Markdown("### Usage Tips")
74
+ gr.Markdown("- For best results, keep your text reasonably short (up to ~500 characters)")
75
  gr.Markdown("- Adjust the speed slider to modify the pace of speech")
76
  gr.Markdown("- The model may take a moment to load on first use")
77
 
78
  # Launch the app
79
  if __name__ == "__main__":
80
+ app.launch()