Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
|
2 |
# Initialize a pipeline
|
3 |
from kokoro import KPipeline
|
4 |
# from IPython.display import display, Audio
|
@@ -6,24 +5,38 @@ from kokoro import KPipeline
|
|
6 |
import os
|
7 |
from huggingface_hub import list_repo_files
|
8 |
import uuid
|
9 |
-
import re
|
10 |
import gradio as gr
|
11 |
|
12 |
|
13 |
-
# Translate language
|
14 |
from deep_translator import GoogleTranslator
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
# lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
|
28 |
lang_code=language_map_local[target_language]
|
29 |
sentences = re.split(r'(?<=[.!?])\s+', text) # Split text into sentences
|
@@ -43,7 +56,7 @@ def bulk_translate(text, target_language, chunk_size=500):
|
|
43 |
translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
|
44 |
result=" ".join(translated_chunks)
|
45 |
return result.strip()
|
46 |
-
|
47 |
# Language mapping dictionary
|
48 |
language_map = {
|
49 |
"American English": "a",
|
@@ -67,7 +80,7 @@ def update_pipeline(Language):
|
|
67 |
# Only update if the language is different
|
68 |
if new_lang != last_used_language:
|
69 |
pipeline = KPipeline(lang_code=new_lang)
|
70 |
-
last_used_language = new_lang
|
71 |
try:
|
72 |
pipeline = KPipeline(lang_code=new_lang)
|
73 |
last_used_language = new_lang # Update last used language
|
@@ -125,7 +138,7 @@ def clean_text(text):
|
|
125 |
r'[\U00002702-\U000027B0]|' # Dingbats
|
126 |
r'[\U0001F1E0-\U0001F1FF]' # Flags (iOS)
|
127 |
r'', flags=re.UNICODE)
|
128 |
-
|
129 |
text = emoji_pattern.sub(r'', text)
|
130 |
|
131 |
# Remove multiple spaces and extra line breaks
|
@@ -139,13 +152,13 @@ def tts_file_name(text,language):
|
|
139 |
text = re.sub(r'[^a-zA-Z\s]', '', text) # Retain only alphabets and spaces
|
140 |
text = text.lower().strip() # Convert to lowercase and strip leading/trailing spaces
|
141 |
text = text.replace(" ", "_") # Replace spaces with underscores
|
142 |
-
language=language.replace(" ", "_").strip()
|
143 |
# Truncate or handle empty text
|
144 |
truncated_text = text[:20] if len(text) > 20 else text if len(text) > 0 else language
|
145 |
-
|
146 |
# Generate a random string for uniqueness
|
147 |
random_string = uuid.uuid4().hex[:8].upper()
|
148 |
-
|
149 |
# Construct the file name
|
150 |
file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
|
151 |
return file_name
|
@@ -166,7 +179,7 @@ def remove_silence_function(file_path,minimum_silence=50):
|
|
166 |
audio_chunks = split_on_silence(sound,
|
167 |
min_silence_len=100,
|
168 |
silence_thresh=-45,
|
169 |
-
keep_silence=minimum_silence)
|
170 |
# Putting the file back together
|
171 |
combined = AudioSegment.empty()
|
172 |
for chunk in audio_chunks:
|
@@ -205,7 +218,7 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
|
|
205 |
duration_sec = len(audio_np) / 24000
|
206 |
timestamps[i]["duration"] = duration_sec
|
207 |
wav_file.writeframes(audio_bytes)
|
208 |
-
if remove_silence:
|
209 |
keep_silence = int(keep_silence_up_to * 1000)
|
210 |
new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence)
|
211 |
return new_wave_file,timestamps
|
@@ -257,7 +270,7 @@ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuati
|
|
257 |
|
258 |
for entry in word_level_timestamps:
|
259 |
word = entry["word"]
|
260 |
-
|
261 |
# Skip punctuation if enabled
|
262 |
if skip_punctuation and all(char in string.punctuation for char in word):
|
263 |
continue
|
@@ -320,13 +333,13 @@ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_w
|
|
320 |
|
321 |
# Skip selected punctuation from remove_punctuation list
|
322 |
if word in remove_punctuation:
|
323 |
-
continue
|
324 |
|
325 |
# Attach punctuation to the previous word
|
326 |
if word in string.punctuation:
|
327 |
if subtitle_words:
|
328 |
subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
|
329 |
-
continue
|
330 |
|
331 |
# Start a new subtitle block if needed
|
332 |
if start_time is None:
|
@@ -383,16 +396,16 @@ import re
|
|
383 |
def fix_punctuation(text):
|
384 |
# Remove spaces before punctuation marks (., ?, !, ,)
|
385 |
text = re.sub(r'\s([.,?!])', r'\1', text)
|
386 |
-
|
387 |
# Handle quotation marks: remove spaces before and after them
|
388 |
text = text.replace('" ', '"')
|
389 |
text = text.replace(' "', '"')
|
390 |
text = text.replace('" ', '"')
|
391 |
-
|
392 |
# Track quotation marks to add space after closing quotes
|
393 |
track = 0
|
394 |
result = []
|
395 |
-
|
396 |
for index, char in enumerate(text):
|
397 |
if char == '"':
|
398 |
track += 1
|
@@ -495,10 +508,9 @@ def save_current_data():
|
|
495 |
if os.path.exists("./last"):
|
496 |
shutil.rmtree("./last")
|
497 |
os.makedirs("./last",exist_ok=True)
|
498 |
-
|
499 |
-
|
500 |
def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False,keep_silence_up_to=0.05):
|
501 |
-
if translate_text:
|
502 |
text=bulk_translate(text, Language, chunk_size=500)
|
503 |
save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to)
|
504 |
if remove_silence==False:
|
@@ -516,16 +528,15 @@ def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,t
|
|
516 |
shutil.copy(normal_srt, "./last/")
|
517 |
shutil.copy(json_file, "./last/")
|
518 |
return save_path,save_path,word_level_srt,normal_srt,json_file
|
519 |
-
return save_path,save_path,None,None,None
|
520 |
-
|
521 |
-
|
522 |
|
523 |
|
524 |
|
525 |
-
def
|
526 |
-
def toggle_autoplay(autoplay):
|
527 |
return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
|
528 |
-
|
|
|
|
|
529 |
# Define examples in the format you mentioned
|
530 |
dummy_examples = [
|
531 |
["Hey, y'all, let’s grab some coffee and catch up!", "American English", "af_bella"],
|
@@ -538,17 +549,16 @@ def ui():
|
|
538 |
["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
|
539 |
["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
|
540 |
]
|
541 |
-
|
542 |
with gr.Blocks() as demo:
|
543 |
# gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>") # Larger title with CSS
|
544 |
-
gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
|
545 |
-
|
546 |
-
voice_names = get_voice_names("hexgrad/Kokoro-82M")
|
547 |
|
548 |
with gr.Row():
|
549 |
with gr.Column():
|
550 |
text = gr.Textbox(label='📝 Enter Text', lines=3)
|
551 |
-
|
552 |
with gr.Row():
|
553 |
language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
|
554 |
|
@@ -588,7 +598,7 @@ def tutorial():
|
|
588 |
# Markdown explanation for language code
|
589 |
explanation = """
|
590 |
## Language Code Explanation:
|
591 |
-
Example: `'af_bella'`
|
592 |
- **'a'** stands for **American English**.
|
593 |
- **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
|
594 |
- **'bella'** refers to the specific voice.
|
@@ -609,11 +619,298 @@ def tutorial():
|
|
609 |
- **"m_"**: Male
|
610 |
"""
|
611 |
with gr.Blocks() as demo2:
|
612 |
-
gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
|
613 |
gr.Markdown(explanation) # Display the explanation
|
614 |
return demo2
|
615 |
|
616 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
617 |
|
618 |
import click
|
619 |
@click.command()
|
@@ -622,8 +919,9 @@ import click
|
|
622 |
def main(debug, share):
|
623 |
# def main(debug=True, share=True):
|
624 |
demo1 = ui()
|
625 |
-
demo2 =
|
626 |
-
|
|
|
627 |
demo.queue().launch(debug=debug, share=share)
|
628 |
# demo.queue().launch(debug=debug, share=share,server_port=9000)
|
629 |
#Run on local network
|
@@ -638,4 +936,4 @@ last_used_language = "a"
|
|
638 |
pipeline = KPipeline(lang_code=last_used_language)
|
639 |
temp_folder = create_audio_dir()
|
640 |
if __name__ == "__main__":
|
641 |
-
main()
|
|
|
|
|
1 |
# Initialize a pipeline
|
2 |
from kokoro import KPipeline
|
3 |
# from IPython.display import display, Audio
|
|
|
5 |
import os
|
6 |
from huggingface_hub import list_repo_files
|
7 |
import uuid
|
8 |
+
import re
|
9 |
import gradio as gr
|
10 |
|
11 |
|
12 |
+
# Translate language
|
13 |
from deep_translator import GoogleTranslator
|
14 |
+
# Maps the language names used by this app (the same names listed in
# `lang_list`) to the target-language codes handed to
# `GoogleTranslator(target=...)`.
# Both English variants share the single code "en".
language_map_local = {
    "American English": "en",
    "British English": "en",
    "Hindi": "hi",
    "Spanish": "es",
    "French": "fr",
    "Italian": "it",
    "Brazilian Portuguese": "pt",
    "Japanese": "ja",
    "Mandarin Chinese": "zh-CN"
}
|
25 |
+
def bulk_translate(text, target_language, chunk_size=500,MAX_ALLOWED_CHARACTERS = 10000):
|
26 |
+
if len(text)>=MAX_ALLOWED_CHARACTERS:
|
27 |
+
gr.Warning("[WARNING] Text too long — skipping translation to prevent Google Translate abuse.")
|
28 |
+
return text
|
29 |
+
# language_map_local = {
|
30 |
+
# "American English": "en",
|
31 |
+
# "British English": "en",
|
32 |
+
# "Hindi": "hi",
|
33 |
+
# "Spanish": "es",
|
34 |
+
# "French": "fr",
|
35 |
+
# "Italian": "it",
|
36 |
+
# "Brazilian Portuguese": "pt",
|
37 |
+
# "Japanese": "ja",
|
38 |
+
# "Mandarin Chinese": "zh-CN"
|
39 |
+
# }
|
40 |
# lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
|
41 |
lang_code=language_map_local[target_language]
|
42 |
sentences = re.split(r'(?<=[.!?])\s+', text) # Split text into sentences
|
|
|
56 |
translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
|
57 |
result=" ".join(translated_chunks)
|
58 |
return result.strip()
|
59 |
+
|
60 |
# Language mapping dictionary
|
61 |
language_map = {
|
62 |
"American English": "a",
|
|
|
80 |
# Only update if the language is different
|
81 |
if new_lang != last_used_language:
|
82 |
pipeline = KPipeline(lang_code=new_lang)
|
83 |
+
last_used_language = new_lang
|
84 |
try:
|
85 |
pipeline = KPipeline(lang_code=new_lang)
|
86 |
last_used_language = new_lang # Update last used language
|
|
|
138 |
r'[\U00002702-\U000027B0]|' # Dingbats
|
139 |
r'[\U0001F1E0-\U0001F1FF]' # Flags (iOS)
|
140 |
r'', flags=re.UNICODE)
|
141 |
+
|
142 |
text = emoji_pattern.sub(r'', text)
|
143 |
|
144 |
# Remove multiple spaces and extra line breaks
|
|
|
152 |
text = re.sub(r'[^a-zA-Z\s]', '', text) # Retain only alphabets and spaces
|
153 |
text = text.lower().strip() # Convert to lowercase and strip leading/trailing spaces
|
154 |
text = text.replace(" ", "_") # Replace spaces with underscores
|
155 |
+
language=language.replace(" ", "_").strip()
|
156 |
# Truncate or handle empty text
|
157 |
truncated_text = text[:20] if len(text) > 20 else text if len(text) > 0 else language
|
158 |
+
|
159 |
# Generate a random string for uniqueness
|
160 |
random_string = uuid.uuid4().hex[:8].upper()
|
161 |
+
|
162 |
# Construct the file name
|
163 |
file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
|
164 |
return file_name
|
|
|
179 |
audio_chunks = split_on_silence(sound,
|
180 |
min_silence_len=100,
|
181 |
silence_thresh=-45,
|
182 |
+
keep_silence=minimum_silence)
|
183 |
# Putting the file back together
|
184 |
combined = AudioSegment.empty()
|
185 |
for chunk in audio_chunks:
|
|
|
218 |
duration_sec = len(audio_np) / 24000
|
219 |
timestamps[i]["duration"] = duration_sec
|
220 |
wav_file.writeframes(audio_bytes)
|
221 |
+
if remove_silence:
|
222 |
keep_silence = int(keep_silence_up_to * 1000)
|
223 |
new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence)
|
224 |
return new_wave_file,timestamps
|
|
|
270 |
|
271 |
for entry in word_level_timestamps:
|
272 |
word = entry["word"]
|
273 |
+
|
274 |
# Skip punctuation if enabled
|
275 |
if skip_punctuation and all(char in string.punctuation for char in word):
|
276 |
continue
|
|
|
333 |
|
334 |
# Skip selected punctuation from remove_punctuation list
|
335 |
if word in remove_punctuation:
|
336 |
+
continue
|
337 |
|
338 |
# Attach punctuation to the previous word
|
339 |
if word in string.punctuation:
|
340 |
if subtitle_words:
|
341 |
subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
|
342 |
+
continue
|
343 |
|
344 |
# Start a new subtitle block if needed
|
345 |
if start_time is None:
|
|
|
396 |
def fix_punctuation(text):
|
397 |
# Remove spaces before punctuation marks (., ?, !, ,)
|
398 |
text = re.sub(r'\s([.,?!])', r'\1', text)
|
399 |
+
|
400 |
# Handle quotation marks: remove spaces before and after them
|
401 |
text = text.replace('" ', '"')
|
402 |
text = text.replace(' "', '"')
|
403 |
text = text.replace('" ', '"')
|
404 |
+
|
405 |
# Track quotation marks to add space after closing quotes
|
406 |
track = 0
|
407 |
result = []
|
408 |
+
|
409 |
for index, char in enumerate(text):
|
410 |
if char == '"':
|
411 |
track += 1
|
|
|
508 |
if os.path.exists("./last"):
|
509 |
shutil.rmtree("./last")
|
510 |
os.makedirs("./last",exist_ok=True)
|
511 |
+
|
|
|
512 |
def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False,keep_silence_up_to=0.05):
|
513 |
+
if translate_text:
|
514 |
text=bulk_translate(text, Language, chunk_size=500)
|
515 |
save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to)
|
516 |
if remove_silence==False:
|
|
|
528 |
shutil.copy(normal_srt, "./last/")
|
529 |
shutil.copy(json_file, "./last/")
|
530 |
return save_path,save_path,word_level_srt,normal_srt,json_file
|
531 |
+
return save_path,save_path,None,None,None
|
|
|
|
|
532 |
|
533 |
|
534 |
|
535 |
+
def toggle_autoplay(autoplay):
    """Build the output audio component with the requested autoplay setting.

    Args:
        autoplay: Value forwarded unchanged to ``gr.Audio(autoplay=...)``.

    Returns:
        A non-interactive ``gr.Audio`` component labelled 'Output Audio'.
    """
    player_options = {
        "interactive": False,
        "label": 'Output Audio',
        "autoplay": autoplay,
    }
    return gr.Audio(**player_options)
|
537 |
+
# Language names offered in the "Select Language" dropdowns of the UI.
lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
# Choices for the voice dropdown, computed once when the module is imported.
# NOTE(review): presumably lists the voice files of the "hexgrad/Kokoro-82M"
# Hugging Face repo - confirm against get_voice_names.
voice_names = get_voice_names("hexgrad/Kokoro-82M")
|
539 |
+
def ui():
|
540 |
# Define examples in the format you mentioned
|
541 |
dummy_examples = [
|
542 |
["Hey, y'all, let’s grab some coffee and catch up!", "American English", "af_bella"],
|
|
|
549 |
["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
|
550 |
["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
|
551 |
]
|
552 |
+
|
553 |
with gr.Blocks() as demo:
|
554 |
# gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>") # Larger title with CSS
|
555 |
+
# gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
|
556 |
+
|
|
|
557 |
|
558 |
with gr.Row():
|
559 |
with gr.Column():
|
560 |
text = gr.Textbox(label='📝 Enter Text', lines=3)
|
561 |
+
|
562 |
with gr.Row():
|
563 |
language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
|
564 |
|
|
|
598 |
# Markdown explanation for language code
|
599 |
explanation = """
|
600 |
## Language Code Explanation:
|
601 |
+
Example: `'af_bella'`
|
602 |
- **'a'** stands for **American English**.
|
603 |
- **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
|
604 |
- **'bella'** refers to the specific voice.
|
|
|
619 |
- **"m_"**: Male
|
620 |
"""
|
621 |
with gr.Blocks() as demo2:
|
622 |
+
# gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
|
623 |
gr.Markdown(explanation) # Display the explanation
|
624 |
return demo2
|
625 |
|
626 |
|
627 |
+
#@title subtitle
|
628 |
+
import os
|
629 |
+
import re
|
630 |
+
import uuid
|
631 |
+
import shutil
|
632 |
+
import platform
|
633 |
+
import datetime
|
634 |
+
import subprocess
|
635 |
+
|
636 |
+
import pysrt
|
637 |
+
import librosa
|
638 |
+
import soundfile as sf
|
639 |
+
from tqdm.auto import tqdm
|
640 |
+
from pydub import AudioSegment
|
641 |
+
from deep_translator import GoogleTranslator
|
642 |
+
|
643 |
+
|
644 |
+
# ---------------------- Utility Functions ----------------------
|
645 |
+
def get_current_time():
    """Return the current local time as ``HH_MM_<AM|PM>`` on a 12-hour clock."""
    timestamp_format = "%I_%M_%p"
    now = datetime.datetime.now()
    return now.strftime(timestamp_format)
|
647 |
+
|
648 |
+
def get_subtitle_Dub_path(srt_file_path, Language):
    """Build a unique output ``.wav`` path for the dub of a subtitle file.

    The file lives in a ``TTS_DUB`` directory under the current working
    directory (created if missing) and is named
    ``<srt base name>_<language>_<6 random characters>.wav``.

    Args:
        srt_file_path: Path of the source subtitle file. Only its base name
            without the extension is used.
        Language: Language name. It is looked up in ``language_map_local``;
            when absent, the name itself is used with spaces replaced by
            underscores.

    Returns:
        The path the dubbed audio should be written to.
    """
    file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
    full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
    os.makedirs(full_base_path, exist_ok=True)
    random_string = str(uuid.uuid4())[:6]
    lang = language_map_local.get(Language, Language.replace(" ", "_"))
    # Collapse doubled underscores in the file name only. Doing it on the
    # joined path would also rewrite directory names that contain "__" and
    # return a path inside a directory that does not exist.
    wav_name = f"{file_name}_{lang}_{random_string}.wav".replace("__", "_")
    return os.path.join(full_base_path, wav_name)
|
656 |
+
|
657 |
+
def clean_srt(input_path):
    """Write a cleaned copy of an SRT file and return the copy's path.

    Square brackets and the ``♫`` symbol are removed from every subtitle
    text and surrounding whitespace is stripped. Indexes and timestamps are
    written back as read.

    Args:
        input_path: Path to a UTF-8 encoded subtitle file.

    Returns:
        Path of the cleaned file: ``input_path`` with its extension replaced
        by ``_.srt``.
    """
    def clean_srt_line(text):
        # Strip the markers one by one, then trim leftover whitespace.
        for bad in ("[", "]", "♫"):
            text = text.replace(bad, "")
        return text.strip()

    subs = pysrt.open(input_path, encoding='utf-8')
    # Swap only the extension. Lower-casing the whole path and removing every
    # ".srt" occurrence would also alter directory names, which points at a
    # non-existent location on case-sensitive filesystems.
    root, _extension = os.path.splitext(input_path)
    output_path = f"{root}_.srt"
    with open(output_path, "w", encoding='utf-8') as file:
        for sub in subs:
            file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
    return output_path
|
669 |
+
|
670 |
+
def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
    """Translate the text of an SRT file and save the result as a new file.

    Every subtitle text is prefixed with a ``<#index>`` marker, the marked
    texts are joined and sent to ``GoogleTranslator`` in chunks, and the
    translated text is split back per subtitle using the markers. Subtitles
    whose marker cannot be found in the translation keep their original text.

    Args:
        input_path: Path to a UTF-8 encoded ``.srt`` file.
        target_language: Language name looked up in ``language_map_local``;
            unknown names fall back to ``"en"``.
        max_segments: When the file has more subtitles than this, a warning
            is shown and the file is left untranslated.
        chunk_size: Maximum number of characters per translation request.

    Returns:
        Path of the translated file, or ``input_path`` unchanged when the
        segment limit is exceeded.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Insert the language name before the extension only; a global
    # str.replace would also touch ".srt" occurring elsewhere in the path.
    root, extension = os.path.splitext(input_path)
    output_path = f"{root}{target_language}{extension or '.srt'}"

    subs = pysrt.open(input_path, encoding='utf-8')
    if len(subs) > max_segments:
        gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
        return input_path

    original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
    full_text = "\n".join(original)

    chunks, start = [], 0
    text_length = len(full_text)
    while start < text_length:
        end = start + chunk_size
        if end >= text_length:
            split_point = text_length
        else:
            # Look for a marker strictly after `start`. `start` itself sits on
            # a marker, so including it would return `start` for a segment
            # longer than chunk_size and the loop would never advance.
            split_point = full_text.rfind("<#", start + 1, end)
            if split_point == -1:
                # A single segment exceeds chunk_size: cut it at the limit.
                split_point = end
        chunks.append(full_text[start:split_point])
        start = split_point

    lang_code = language_map_local.get(target_language, "en")
    translator = GoogleTranslator(target=lang_code)
    translated_chunks = []
    for chunk in chunks:
        translated = translator.translate(chunk)
        # Keep the untranslated chunk if the translator returns nothing, so
        # the join below always receives strings.
        translated_chunks.append(translated if translated else chunk)
    translated_text = "\n".join(translated_chunks)

    # Capture each marker index and the text up to the next marker (or end).
    pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
    translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}

    for i, sub in enumerate(subs):
        sub.text = translated_dict.get(i, sub.text)

    subs.save(output_path, encoding='utf-8')
    return output_path
|
699 |
+
|
700 |
+
def prepare_srt(srt_path, target_language, translate=False):
    """Clean an SRT file and, when requested, translate the cleaned copy.

    Args:
        srt_path: Path of the subtitle file to prepare.
        target_language: Language name passed to ``translate_srt``.
        translate: When truthy, the cleaned file is translated as well.

    Returns:
        Path of the cleaned file, or the value returned by ``translate_srt``
        when translation is requested.
    """
    cleaned_path = clean_srt(srt_path)
    if not translate:
        return cleaned_path
    return translate_srt(cleaned_path, target_language)
|
703 |
+
|
704 |
+
|
705 |
+
def is_ffmpeg_installed():
    """Check whether the FFmpeg executable can be run.

    Runs ``<ffmpeg> -version`` and treats any exception as "not available",
    in which case a Gradio warning is shown.

    Returns:
        A ``(available, executable_name)`` tuple. The executable name is
        ``"ffmpeg.exe"`` on Windows and ``"ffmpeg"`` elsewhere, and is
        returned in both cases.
    """
    if platform.system() == "Windows":
        ffmpeg_exe = "ffmpeg.exe"
    else:
        ffmpeg_exe = "ffmpeg"

    probe_command = [ffmpeg_exe, "-version"]
    try:
        subprocess.run(probe_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    except Exception:
        gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
        return False, ffmpeg_exe
    return True, ffmpeg_exe
|
713 |
+
|
714 |
+
def speedup_audio_librosa(input_file, output_file, speedup_factor):
    """Time-stretch an audio file with librosa and write the result.

    If anything fails, a Gradio warning is shown and the input file is
    copied to ``output_file`` unchanged.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path the processed audio is written to.
        speedup_factor: Passed as ``rate`` to ``librosa.effects.time_stretch``.
    """
    try:
        samples, sample_rate = librosa.load(input_file, sr=None)
        stretched = librosa.effects.time_stretch(samples, rate=speedup_factor)
        sf.write(output_file, stretched, sample_rate)
    except Exception as err:
        gr.Warning(f"Librosa speedup failed: {err}")
        shutil.copy(input_file, output_file)
|
722 |
+
|
723 |
+
def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
    """Change the tempo of an audio file, preferring FFmpeg over librosa.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path the processed audio is written to.
        speedup_factor: Tempo multiplier, used as FFmpeg's ``atempo`` value
            or passed on to ``speedup_audio_librosa``.
        use_ffmpeg: When truthy, FFmpeg is tried first.
        ffmpeg_path: FFmpeg executable to invoke.
    """
    if not use_ffmpeg:
        speedup_audio_librosa(input_file, output_file, speedup_factor)
        return

    command = [ffmpeg_path, "-i", input_file, "-filter:a", f"atempo={speedup_factor}", output_file, "-y"]
    try:
        subprocess.run(command, check=True)
    except Exception as e:
        # gr.Error is an exception class: building it without raising shows
        # nothing. The failure is recoverable here, so report it as a warning
        # and fall back to librosa.
        gr.Warning(f"FFmpeg speedup error: {e}")
        speedup_audio_librosa(input_file, output_file, speedup_factor)
|
732 |
+
|
733 |
+
def remove_edge_silence(input_path, output_path):
    """Trim leading and trailing silence from an audio file.

    Args:
        input_path: Path of the audio file to read (loaded at its own
            sample rate).
        output_path: Path the trimmed audio is written to.

    Returns:
        ``output_path``.
    """
    samples, sample_rate = librosa.load(input_path, sr=None)
    # librosa.effects.trim returns (trimmed_signal, index); only the signal
    # is needed.
    trimmed = librosa.effects.trim(samples, top_db=30)[0]
    sf.write(output_path, trimmed, sample_rate)
    return output_path
|
738 |
+
|
739 |
+
|
740 |
+
# ---------------------- Main Class ----------------------
|
741 |
+
class SRTDubbing:
    """Turn an SRT subtitle file into a single dubbed audio track.

    Each subtitle is synthesised with ``generate_and_save_audio`` and fitted
    to its time slot, then all clips are concatenated together with the
    pauses that separate the subtitles.
    """

    def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
        """Store the speed-up settings and create the working directories.

        Args:
            use_ffmpeg: Whether FFmpeg should be used for the final tempo
                change (librosa is used otherwise).
            ffmpeg_path: FFmpeg executable to invoke.
        """
        self.use_ffmpeg = use_ffmpeg
        self.ffmpeg_path = ffmpeg_path
        self.cache_dir = "./cache"
        # "./dummy" receives one sub-folder per processed subtitle file.
        os.makedirs("./dummy", exist_ok=True)
        os.makedirs(self.cache_dir, exist_ok=True)

    @staticmethod
    def convert_to_millisecond(t):
        """Convert a time with hours/minutes/seconds/milliseconds to ms."""
        return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)

    @staticmethod
    def read_srt_file(file_path):
        """Read an SRT file into a list of timing/text dictionaries.

        Args:
            file_path: Path to a UTF-8 encoded ``.srt`` file.

        Returns:
            One dict per subtitle with the keys ``entry_number`` (1-based),
            ``start_time`` and ``end_time`` (ms), ``text``, ``pause_time``
            (ms of silence before the subtitle), ``audio_name`` and
            ``previous_pause`` (file names for the clip and its pause).
        """
        subs = pysrt.open(file_path, encoding='utf-8')
        entries = []
        prev_end = 0
        for idx, sub in enumerate(subs, 1):
            start = SRTDubbing.convert_to_millisecond(sub.start)
            end = SRTDubbing.convert_to_millisecond(sub.end)
            # prev_end starts at 0, so the first pause is simply `start`.
            # Overlapping cues would give a negative gap; clamp it to zero.
            pause = max(0, start - prev_end)
            entries.append({
                'entry_number': idx,
                'start_time': start,
                'end_time': end,
                'text': sub.text.strip(),
                'pause_time': pause,
                'audio_name': f"{idx}.wav",
                'previous_pause': f"{idx}_before_pause.wav",
            })
            prev_end = end
        return entries

    def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration):
        """Synthesise one subtitle and fit it to the subtitle's duration.

        The clip is regenerated with silence removal, then at a higher
        speed, and finally time-stretched while it is still longer than the
        slot; a clip that is shorter is padded with silence.

        Args:
            text: Subtitle text to speak.
            audio_path: Path the fitted clip is written to.
            language: Language name passed to ``generate_and_save_audio``.
            voice: Voice name passed to ``generate_and_save_audio``.
            actual_duration: Length of the subtitle slot in milliseconds.
        """
        # Scratch files live in the instance's cache directory.
        temp = os.path.join(self.cache_dir, "temp.wav")

        # Step 1: generate the initial audio and trim silence at both edges.
        path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=False, keep_silence_up_to=0.05)
        remove_edge_silence(path, temp)

        # Without a usable slot length there is nothing to fit the clip to.
        # A negative length must not reach the code below, where it would
        # produce a negative speed factor.
        if actual_duration <= 0:
            shutil.move(temp, audio_path)
            return

        audio = AudioSegment.from_file(temp)

        # Step 2: too long -> regenerate with internal silence removed.
        if len(audio) > actual_duration:
            path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=True, keep_silence_up_to=0.05)
            remove_edge_silence(path, temp)
            audio = AudioSegment.from_file(temp)

        # Step 3: still too long -> regenerate at a proportionally higher speed.
        if len(audio) > actual_duration:
            factor = len(audio) / actual_duration
            path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=factor, remove_silence=True, keep_silence_up_to=0.05)
            remove_edge_silence(path, temp)
            audio = AudioSegment.from_file(temp)

        if len(audio) > actual_duration:
            # Final adjustment: time-stretch via FFmpeg or librosa.
            factor = len(audio) / actual_duration
            final_temp = os.path.join(self.cache_dir, "speedup_temp.wav")
            change_speed(temp, final_temp, factor, self.use_ffmpeg, self.ffmpeg_path)
            shutil.move(final_temp, audio_path)
        elif len(audio) < actual_duration:
            # Too short: pad the end with silence up to the slot length.
            silence = AudioSegment.silent(duration=actual_duration - len(audio))
            (audio + silence).export(audio_path, format="wav")
        else:
            # Exact fit: keep the clip as it is.
            shutil.move(temp, audio_path)

    @staticmethod
    def make_silence(duration, path):
        """Write ``duration`` milliseconds of silence to ``path`` as WAV."""
        AudioSegment.silent(duration=duration).export(path, format="wav")

    @staticmethod
    def create_folder_for_srt(srt_file_path):
        """Create and return a fresh working folder for one subtitle file.

        The folder is ``./dummy/<srt base name>_<4 random characters>``.
        """
        base = os.path.splitext(os.path.basename(srt_file_path))[0]
        folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
        os.makedirs(folder, exist_ok=True)
        return folder

    @staticmethod
    def concatenate_audio_files(paths, output):
        """Join the audio files in ``paths`` in order and export as WAV."""
        audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0))
        audio.export(output, format="wav")

    def srt_to_dub(self, srt_path, output_path, language, voice):
        """Create the dubbed track for a whole subtitle file.

        Args:
            srt_path: Path of the subtitle file to dub.
            output_path: Path the combined WAV file is written to.
            language: Language name used for synthesis.
            voice: Voice name used for synthesis.
        """
        entries = self.read_srt_file(srt_path)
        folder = self.create_folder_for_srt(srt_path)
        all_audio = []
        for entry in tqdm(entries):
            # Silence that precedes this subtitle on the timeline.
            pause_path = os.path.join(folder, entry['previous_pause'])
            self.make_silence(entry['pause_time'], pause_path)
            all_audio.append(pause_path)

            # Spoken clip fitted to the subtitle's own duration.
            tts_path = os.path.join(folder, entry['audio_name'])
            self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time'])
            all_audio.append(tts_path)

        self.concatenate_audio_files(all_audio, output_path)
|
844 |
+
|
845 |
+
|
846 |
+
# ---------------------- Entrypoint ----------------------
|
847 |
+
def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False):
    """Generate a dubbed audio file from an uploaded ``.srt`` file.

    Args:
        srt_path: Path of the uploaded subtitle file, or ``None`` when
            nothing was uploaded.
        Language: Language name used for synthesis and, when ``translate``
            is set, as the translation target.
        voice_name: Voice used for synthesis.
        translate: Whether the subtitles are translated before dubbing.

    Returns:
        Path of the generated WAV file, or ``None`` when no valid ``.srt``
        file was supplied.
    """
    # Covers a missing upload as well as a wrong extension. gr.Error is an
    # exception class, so merely constructing it displayed nothing; a
    # warning shows the message while keeping the ``None`` return value.
    if not srt_path or not srt_path.endswith(".srt"):
        gr.Warning("Please upload a valid .srt file", duration=5)
        return None

    use_ffmpeg, ffmpeg_path = is_ffmpeg_installed()
    processed_srt = prepare_srt(srt_path, Language, translate)
    output_path = get_subtitle_Dub_path(srt_path, Language)

    SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name)
    return output_path
|
858 |
+
|
859 |
+
def subtitle_ui():
    """Build the Gradio Blocks interface for dubbing an uploaded SRT file.

    Returns:
        The ``gr.Blocks`` object holding the upload, language, voice and
        translation controls, the output player, and the wiring of the
        Generate button to ``srt_process``.
    """
    # NOTE(review): container nesting below was reconstructed from a
    # flattened listing - confirm against the intended layout.
    with gr.Blocks() as demo:

        gr.Markdown(
            """
# Generate Audio File From Subtitle [Upload Only .srt file]

To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)

"""
        )
        with gr.Row():
            # Input controls.
            with gr.Column():
                srt_file = gr.File(label='Upload .srt Subtitle File Only')
                with gr.Row():
                    language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
                with gr.Row():
                    voice = gr.Dropdown(
                        voice_names,
                        value='af_bella',
                        allow_custom_value=False,
                        label='🎙️ Choose VoicePack',
                    )
                with gr.Row():
                    generate_btn_ = gr.Button('Generate', variant='primary')

                with gr.Accordion('Other Settings', open=False):
                    translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language')



            # Output player and its autoplay toggle.
            with gr.Column():
                audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
                with gr.Accordion('Enable Autoplay', open=False):
                    autoplay = gr.Checkbox(value=True, label='Autoplay')
                    # Re-create the player whenever the checkbox changes.
                    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

        # srt_file.submit(
        #     srt_process,
        #     inputs=[srt_file, voice],
        #     outputs=[audio]
        # )
        # Inputs are listed in the positional order of srt_process's
        # parameters: file, language, voice, translate flag.
        generate_btn_.click(
            srt_process,
            inputs=[srt_file,language_name,voice,translate_text],
            outputs=[audio]
        )
    return demo
|
907 |
+
|
908 |
+
|
909 |
+
|
910 |
+
# Example usage:
|
911 |
+
# srt_file_path = "/content/me.srt"
|
912 |
+
# dub_audio_path = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False)
|
913 |
+
# print(f"Audio file saved at: {dub_audio_path}")
|
914 |
|
915 |
import click
|
916 |
@click.command()
|
|
|
919 |
def main(debug, share):
|
920 |
# def main(debug=True, share=True):
|
921 |
demo1 = ui()
|
922 |
+
demo2 = subtitle_ui()
|
923 |
+
demo3 = tutorial()
|
924 |
+
demo = gr.TabbedInterface([demo1, demo2,demo3],["Multilingual TTS","SRT Dubbing","VoicePack Explanation"],title="Kokoro TTS")#,theme='JohnSmith9982/small_and_pretty')
|
925 |
demo.queue().launch(debug=debug, share=share)
|
926 |
# demo.queue().launch(debug=debug, share=share,server_port=9000)
|
927 |
#Run on local network
|
|
|
936 |
pipeline = KPipeline(lang_code=last_used_language)
|
937 |
temp_folder = create_audio_dir()
|
938 |
if __name__ == "__main__":
|
939 |
+
main()
|