Hassan-16 committed on
Commit
98c217d
·
verified ·
1 Parent(s): dfcc441

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -9
app.py CHANGED
@@ -1,12 +1,105 @@
1
import zipfile
import os

# Location of the packaged model archive and where to unpack it.
zip_path = "model/kokoro-v1_0.zip"
extract_dir = "model/"

# Guard against a missing archive first: zipfile.is_zipfile() opens the
# file, so a nonexistent path raises FileNotFoundError instead of ever
# reaching the "Not a valid ZIP file." branch.
if not os.path.exists(zip_path):
    print(f"Archive not found: {zip_path}")
elif zipfile.is_zipfile(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print("Extraction completed.")
else:
    print("Not a valid ZIP file.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from kokoro import KModel, KPipeline
import gradio as gr
import os
import random
import torch
import logging

# --- Configuration -------------------------------------------------------
VOICE_DIR = "model/voices"
OUTPUT_DIR = "output_audio"
TEXT = "Hello, this is a test of the Kokoro TTS system."

# --- Logging -------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Device selection ----------------------------------------------------
CUDA_AVAILABLE = torch.cuda.is_available()
device = "cuda" if CUDA_AVAILABLE else "cpu"
logger.info(f"Using hardware: {device}")

# One model per device, keyed by an "is GPU" boolean: a CPU model is
# always loaded; a GPU model is added only when CUDA is available.
models = {}
for gpu in ([False, True] if CUDA_AVAILABLE else [False]):
    models[gpu] = KModel("hexgrad/Kokoro-82M").to("cuda" if gpu else "cpu").eval()

# Pipelines for American ('a') and British ('b') English, both bound to
# the CPU model initially; callers may rebind .model to the GPU model.
pipelines = {
    code: KPipeline(model=models[False], lang_code=code, device='cpu')
    for code in ('a', 'b')
}

# Teach both G2P lexicons how to pronounce "kokoro"; tolerate pipeline
# builds that do not expose the g2p.lexicon.golds attribute chain.
try:
    pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
    pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
except AttributeError as e:
    logger.warning(f"Could not set custom pronunciations: {e}")
38
# Core functions for voice generation
def forward_gpu(text, voice_path, speed):
    """Synthesize *text* with the GPU model and return the first audio chunk.

    *voice_path* is a full path such as "model/voices/af_bella.pt" (built by
    generate_first), so the pipeline key ('a' = American, 'b' = British) must
    come from the first character of the file NAME. The original indexed the
    raw path — always 'm' from "model/..." — which raises KeyError.
    Returns None if the generator yields nothing.
    """
    voice_name = os.path.basename(voice_path)
    pipeline = pipelines[voice_name[0]]
    pipeline.model = models[True]  # rebind to the GPU model for this call
    generator = pipeline(text, voice=voice_path, speed=speed)
    for _, _, audio in generator:
        return audio
    return None
46
+
47
def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Generate speech for *text* using the given voice file.

    Returns ((24000, audio_ndarray), phonemes) on success and (None, "")
    when generation yields nothing. The GPU path cannot report phonemes
    (forward_gpu returns audio only), so it returns "" for them.
    Raises FileNotFoundError when the voice file is missing and gr.Error
    when CPU generation fails.
    """
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    # Pipeline selection keys on the voice FILE name ('a'/'b' prefix).
    pipeline = pipelines[voice[0]]
    use_gpu = use_gpu and CUDA_AVAILABLE
    try:
        if use_gpu:
            audio = forward_gpu(text, voice_path, speed)
            # BUG FIX: the original discarded this result and fell through
            # to `return None, ""` — return it like the CPU path does.
            if audio is not None:
                return (24000, audio.numpy()), ""
        else:
            pipeline.model = models[False]
            generator = pipeline(text, voice=voice_path, speed=speed)
            for _, ps, audio in generator:
                return (24000, audio.numpy()), ps
    except gr.exceptions.Error as e:
        if use_gpu:
            # GPU attempt failed: warn and retry once on the CPU model.
            gr.Warning(str(e))
            pipeline.model = models[False]
            generator = pipeline(text, voice=voice_path, speed=speed)
            for _, ps, audio in generator:
                return (24000, audio.numpy()), ps
        else:
            # gr.Error expects a message string, not an exception object.
            raise gr.Error(str(e))
    return None, ""
72
+
73
# Load available voices
def load_voice_choices(voice_dir=None):
    """Scan a directory for .pt voice files.

    Args:
        voice_dir: directory to scan; defaults to the module-level
            VOICE_DIR (parameterized for reuse/testing — backward
            compatible with the original zero-argument call).

    Creates the directory if it does not exist. Returns a dict mapping
    each voice file name to itself (display name -> value), sorted for a
    stable dropdown ordering.
    """
    if voice_dir is None:
        voice_dir = VOICE_DIR
    os.makedirs(voice_dir, exist_ok=True)
    voice_files = sorted(f for f in os.listdir(voice_dir) if f.endswith('.pt'))
    return {voice_file: voice_file for voice_file in voice_files}
80
+
81
# Discover available voices once at startup.
CHOICES = load_voice_choices()

if not CHOICES:
    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
    CHOICES = {"Bella": "af_bella.pt"}

TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
⬆️ Adjust stress levels using special notations.
'''

# Gradio Interface
with gr.Blocks() as app:
    with gr.Row():
        text = gr.Textbox(label="Input Text", value=TEXT)
        voice = gr.Dropdown(list(CHOICES.values()), label="Voice", value=list(CHOICES.values())[0])
        speed = gr.Slider(0.5, 2, value=1, label="Speed")
    output_audio = gr.Audio(label="Output Audio", interactive=False)
    # BUG FIX: generate_first returns a 2-tuple ((sr, audio), phonemes);
    # wiring only one output component makes Gradio fail on the
    # mismatched return arity. Add a phonemes textbox as second output.
    output_ps = gr.Textbox(label="Phonemes", interactive=False)
    gr.Markdown(TOKEN_NOTE)  # was defined but never displayed
    generate_btn = gr.Button("Generate")

    generate_btn.click(fn=generate_first, inputs=[text, voice, speed], outputs=[output_audio, output_ps])

# Run the app
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)