NeuralFalcon committed
Commit 7d04582 · verified · 1 Parent(s): 5d51aed

Update app.py

Files changed (1)
  1. app.py +346 -48
app.py CHANGED
@@ -1,4 +1,3 @@
-
 # Initialize a pipeline
 from kokoro import KPipeline
 # from IPython.display import display, Audio
@@ -6,24 +5,38 @@ from kokoro import KPipeline
 import os
 from huggingface_hub import list_repo_files
 import uuid
-import re
+import re
 import gradio as gr
 
 
-# translate language
+# translate language
 from deep_translator import GoogleTranslator
-def bulk_translate(text, target_language, chunk_size=500):
-    language_map_local = {
-        "American English": "en",
-        "British English": "en",
-        "Hindi": "hi",
-        "Spanish": "es",
-        "French": "fr",
-        "Italian": "it",
-        "Brazilian Portuguese": "pt",
-        "Japanese": "ja",
-        "Mandarin Chinese": "zh-CN"
-    }
+language_map_local = {
+    "American English": "en",
+    "British English": "en",
+    "Hindi": "hi",
+    "Spanish": "es",
+    "French": "fr",
+    "Italian": "it",
+    "Brazilian Portuguese": "pt",
+    "Japanese": "ja",
+    "Mandarin Chinese": "zh-CN"
+}
+def bulk_translate(text, target_language, chunk_size=500, MAX_ALLOWED_CHARACTERS=10000):
+    if len(text)>=MAX_ALLOWED_CHARACTERS:
+        gr.Warning("[WARNING] Text too long — skipping translation to prevent Google Translate abuse.")
+        return text
+    # language_map_local = {
+    #     "American English": "en",
+    #     "British English": "en",
+    #     "Hindi": "hi",
+    #     "Spanish": "es",
+    #     "French": "fr",
+    #     "Italian": "it",
+    #     "Brazilian Portuguese": "pt",
+    #     "Japanese": "ja",
+    #     "Mandarin Chinese": "zh-CN"
+    # }
     # lang_code = GoogleTranslator().get_supported_languages(as_dict=True).get(target_language.lower())
     lang_code=language_map_local[target_language]
     sentences = re.split(r'(?<=[.!?])\s+', text)  # Split text into sentences
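A quick usage sketch of the guarded translator above (hypothetical values; `bulk_translate` is the function from this diff, and any input at or over `MAX_ALLOWED_CHARACTERS` comes back untranslated after a `gr.Warning`):

```python
short = bulk_translate("How are you? I am fine.", "Hindi")  # translated chunk by chunk
long_text = "word " * 5000                                  # ~25,000 characters
unchanged = bulk_translate(long_text, "Spanish")            # guard trips, original text returned
```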
@@ -43,7 +56,7 @@ def bulk_translate(text, target_language, chunk_size=500):
     translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
     result=" ".join(translated_chunks)
     return result.strip()
-
+
 # Language mapping dictionary
 language_map = {
     "American English": "a",
@@ -67,7 +80,7 @@ def update_pipeline(Language):
     # Only update if the language is different
     if new_lang != last_used_language:
         pipeline = KPipeline(lang_code=new_lang)
-        last_used_language = new_lang
+        last_used_language = new_lang
         try:
             pipeline = KPipeline(lang_code=new_lang)
             last_used_language = new_lang  # Update last used language
@@ -125,7 +138,7 @@ def clean_text(text):
         r'[\U00002702-\U000027B0]|'  # Dingbats
         r'[\U0001F1E0-\U0001F1FF]'  # Flags (iOS)
         r'', flags=re.UNICODE)
-
+
     text = emoji_pattern.sub(r'', text)
 
     # Remove multiple spaces and extra line breaks
@@ -139,13 +152,13 @@ def tts_file_name(text,language):
     text = re.sub(r'[^a-zA-Z\s]', '', text)  # Retain only alphabets and spaces
     text = text.lower().strip()  # Convert to lowercase and strip leading/trailing spaces
     text = text.replace(" ", "_")  # Replace spaces with underscores
-    language=language.replace(" ", "_").strip()
+    language=language.replace(" ", "_").strip()
     # Truncate or handle empty text
     truncated_text = text[:20] if len(text) > 20 else text if len(text) > 0 else language
-
+
     # Generate a random string for uniqueness
     random_string = uuid.uuid4().hex[:8].upper()
-
+
     # Construct the file name
     file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
     return file_name
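As a worked example of the sanitization above (a standalone sketch; the real function prefixes the global `temp_folder`, and the random part varies per call):

```python
import re
import uuid

text = "Hello, World! 123"
text = re.sub(r'[^a-zA-Z\s]', '', text)        # "Hello World "
text = text.lower().strip().replace(" ", "_")  # "hello_world"
print(f"{text[:20]}_{uuid.uuid4().hex[:8].upper()}.wav")  # e.g. "hello_world_3F2A9C1B.wav"
```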
@@ -166,7 +179,7 @@ def remove_silence_function(file_path,minimum_silence=50):
     audio_chunks = split_on_silence(sound,
                                     min_silence_len=100,
                                     silence_thresh=-45,
-                                    keep_silence=minimum_silence)
+                                    keep_silence=minimum_silence)
     # Putting the file back together
     combined = AudioSegment.empty()
     for chunk in audio_chunks:
@@ -205,7 +218,7 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
             duration_sec = len(audio_np) / 24000
             timestamps[i]["duration"] = duration_sec
             wav_file.writeframes(audio_bytes)
-    if remove_silence:
+    if remove_silence:
         keep_silence = int(keep_silence_up_to * 1000)
         new_wave_file=remove_silence_function(save_path,minimum_silence=keep_silence)
         return new_wave_file,timestamps
@@ -257,7 +270,7 @@ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuati
 
     for entry in word_level_timestamps:
         word = entry["word"]
-
+
         # Skip punctuation if enabled
         if skip_punctuation and all(char in string.punctuation for char in word):
            continue
@@ -320,13 +333,13 @@ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_w
 
         # Skip selected punctuation from remove_punctuation list
         if word in remove_punctuation:
-            continue
+            continue
 
         # Attach punctuation to the previous word
         if word in string.punctuation:
             if subtitle_words:
                 subtitle_words[-1] = (subtitle_words[-1][0] + word, subtitle_words[-1][1])
-            continue
+            continue
 
         # Start a new subtitle block if needed
         if start_time is None:
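For reference, the cue format `write_sentence_srt` is building toward is standard SubRip: a numeric index, a `HH:MM:SS,mmm --> HH:MM:SS,mmm` time range, then the text (timings below are invented for illustration):

```
1
00:00:00,000 --> 00:00:02,400
Hey, y'all, let's grab some coffee and catch up!

2
00:00:02,650 --> 00:00:04,100
Sounds good to me.
```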
@@ -383,16 +396,16 @@ import re
 def fix_punctuation(text):
     # Remove spaces before punctuation marks (., ?, !, ,)
     text = re.sub(r'\s([.,?!])', r'\1', text)
-
+
     # Handle quotation marks: remove spaces before and after them
     text = text.replace('" ', '"')
     text = text.replace(' "', '"')
     text = text.replace('" ', '"')
-
+
     # Track quotation marks to add space after closing quotes
     track = 0
     result = []
-
+
     for index, char in enumerate(text):
         if char == '"':
             track += 1
@@ -495,10 +508,9 @@ def save_current_data():
     if os.path.exists("./last"):
         shutil.rmtree("./last")
     os.makedirs("./last",exist_ok=True)
-
-
+
 def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,translate_text=False,remove_silence=False,keep_silence_up_to=0.05):
-    if translate_text:
+    if translate_text:
         text=bulk_translate(text, Language, chunk_size=500)
     save_path,timestamps=generate_and_save_audio(text=text, Language=Language,voice=voice, speed=speed,remove_silence=remove_silence,keep_silence_up_to=keep_silence_up_to)
     if remove_silence==False:
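A hedged usage sketch of the entry point as it now reads (argument names from the signature above; per the code, the SRT and JSON paths are only produced on the `remove_silence==False` branch, since trimming would shift the word timestamps):

```python
# Hypothetical call; the first two return values are the same audio path
# (player + downloadable file), followed by word SRT, sentence SRT, and JSON.
audio, file_out, word_srt, sent_srt, meta_json = KOKORO_TTS_API(
    "Hello there!", Language="American English", voice="af_bella",
    speed=1, translate_text=False, remove_silence=False, keep_silence_up_to=0.05,
)
```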
@@ -516,16 +528,15 @@ def KOKORO_TTS_API(text, Language="American English",voice="af_bella", speed=1,t
         shutil.copy(normal_srt, "./last/")
         shutil.copy(json_file, "./last/")
         return save_path,save_path,word_level_srt,normal_srt,json_file
-    return save_path,save_path,None,None,None
-
-
+    return save_path,save_path,None,None,None
 
 
 
-def ui():
-    def toggle_autoplay(autoplay):
+def toggle_autoplay(autoplay):
     return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
-
+lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
+voice_names = get_voice_names("hexgrad/Kokoro-82M")
+def ui():
     # Define examples in the format you mentioned
     dummy_examples = [
         ["Hey, y'all, let’s grab some coffee and catch up!", "American English", "af_bella"],
@@ -538,17 +549,16 @@ def ui():
         ["こんにちは、お元気ですか?", "Japanese", "jf_nezumi"],
         ["你好,你怎么样?", "Mandarin Chinese", "zf_xiaoni"]
     ]
-
+
     with gr.Blocks() as demo:
         # gr.Markdown("<center><h1 style='font-size: 40px;'>KOKORO TTS</h1></center>") # Larger title with CSS
-        gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
-        lang_list = ['American English', 'British English', 'Hindi', 'Spanish', 'French', 'Italian', 'Brazilian Portuguese', 'Japanese', 'Mandarin Chinese']
-        voice_names = get_voice_names("hexgrad/Kokoro-82M")
+        # gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
+
 
         with gr.Row():
             with gr.Column():
                 text = gr.Textbox(label='📝 Enter Text', lines=3)
-
+
                 with gr.Row():
                     language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
 
@@ -588,7 +598,7 @@ def tutorial():
     # Markdown explanation for language code
     explanation = """
 ## Language Code Explanation:
-Example: `'af_bella'`
+Example: `'af_bella'`
 - **'a'** stands for **American English**.
 - **'f_'** stands for **Female** (If it were 'm_', it would mean Male).
 - **'bella'** refers to the specific voice.
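Since the naming rule is mechanical, a small parser sketch can make it concrete (only the `'a'` prefix is shown in this file's `language_map`; treating the other prefix letters the same way is an assumption):

```python
# Sketch: decode a voice pack name like "af_bella" into its parts.
LANG_PREFIX = {"a": "American English"}  # extend with the other prefix letters as needed
GENDER = {"f": "Female", "m": "Male"}

def describe_voice(pack: str) -> str:
    prefix, name = pack.split("_", 1)
    lang = LANG_PREFIX.get(prefix[0], "unknown language")
    gender = GENDER.get(prefix[1:2], "unknown gender")
    return f"{name}: {gender}, {lang}"

print(describe_voice("af_bella"))  # -> "bella: Female, American English"
```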
@@ -609,11 +619,298 @@ def tutorial():
 - **"m_"**: Male
 """
     with gr.Blocks() as demo2:
-        gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
+        # gr.Markdown("[Install on Your Local System](https://github.com/NeuralFalconYT/kokoro_v1)")
         gr.Markdown(explanation)  # Display the explanation
     return demo2
 
 
+#@title subtitle
+import os
+import re
+import uuid
+import shutil
+import platform
+import datetime
+import subprocess
+
+import pysrt
+import librosa
+import soundfile as sf
+from tqdm.auto import tqdm
+from pydub import AudioSegment
+from deep_translator import GoogleTranslator
+
+
+# ---------------------- Utility Functions ----------------------
+def get_current_time():
+    return datetime.datetime.now().strftime("%I_%M_%p")
+
+def get_subtitle_Dub_path(srt_file_path, Language):
+    file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
+    full_base_path = os.path.join(os.getcwd(), "TTS_DUB")
+    os.makedirs(full_base_path, exist_ok=True)
+    random_string = str(uuid.uuid4())[:6]
+    lang = language_map_local.get(Language, Language.replace(" ", "_"))
+    new_path = os.path.join(full_base_path, f"{file_name}_{lang}_{random_string}.wav")
+    return new_path.replace("__", "_")
+
+def clean_srt(input_path):
+    def clean_srt_line(text):
+        for bad in ["[", "]", "♫"]:
+            text = text.replace(bad, "")
+        return text.strip()
+
+    subs = pysrt.open(input_path, encoding='utf-8')
+    output_path = input_path.lower().replace(".srt", "") + "_.srt"
+    with open(output_path, "w", encoding='utf-8') as file:
+        for sub in subs:
+            file.write(f"{sub.index}\n{sub.start} --> {sub.end}\n{clean_srt_line(sub.text)}\n\n")
+    return output_path
+
+def translate_srt(input_path, target_language="Hindi", max_segments=500, chunk_size=4000):
+    output_path = input_path.replace(".srt", f"{target_language}.srt")
+    subs = pysrt.open(input_path, encoding='utf-8')
+    if len(subs) > max_segments:
+        gr.Warning(f"Too many segments: {len(subs)} > {max_segments}. Skipping translation.")
+        return input_path
+
+    original = [f"<#{i}>{s.text}" for i, s in enumerate(subs)]
+    full_text = "\n".join(original)
+
+    chunks, start = [], 0
+    while start < len(full_text):
+        end = start + chunk_size
+        split_point = full_text.rfind("<#", start, end) if end < len(full_text) else len(full_text)
+        chunks.append(full_text[start:split_point])
+        start = split_point
+
+    lang_code = language_map_local.get(target_language, "en")
+    translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
+    translated_text = "\n".join(translated_chunks)
+
+    pattern = re.compile(r"<#(\d+)>(.*?)(?=<#\d+>|$)", re.DOTALL)
+    translated_dict = {int(i): txt.strip() for i, txt in pattern.findall(translated_text)}
+
+    for i, sub in enumerate(subs):
+        sub.text = translated_dict.get(i, sub.text)
+
+    subs.save(output_path, encoding='utf-8')
+    return output_path
+
+def prepare_srt(srt_path, target_language, translate=False):
+    path = clean_srt(srt_path)
+    return translate_srt(path, target_language) if translate else path
+
+
+def is_ffmpeg_installed():
+    ffmpeg_exe = "ffmpeg.exe" if platform.system() == "Windows" else "ffmpeg"
+    try:
+        subprocess.run([ffmpeg_exe, "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+        return True, ffmpeg_exe
+    except Exception:
+        gr.Warning("FFmpeg not found. Falling back to librosa for audio speedup.", duration=20)
+        return False, ffmpeg_exe
+
+def speedup_audio_librosa(input_file, output_file, speedup_factor):
+    try:
+        y, sr = librosa.load(input_file, sr=None)
+        y_stretched = librosa.effects.time_stretch(y, rate=speedup_factor)
+        sf.write(output_file, y_stretched, sr)
+    except Exception as e:
+        gr.Warning(f"Librosa speedup failed: {e}")
+        shutil.copy(input_file, output_file)
+
+def change_speed(input_file, output_file, speedup_factor, use_ffmpeg, ffmpeg_path):
+    if use_ffmpeg:
+        try:
+            subprocess.run([ffmpeg_path, "-i", input_file, "-filter:a", f"atempo={speedup_factor}", output_file, "-y"], check=True)
+        except Exception as e:
+            gr.Error(f"FFmpeg speedup error: {e}")
+            speedup_audio_librosa(input_file, output_file, speedup_factor)
+    else:
+        speedup_audio_librosa(input_file, output_file, speedup_factor)
+
+def remove_edge_silence(input_path, output_path):
+    y, sr = librosa.load(input_path, sr=None)
+    trimmed_audio, _ = librosa.effects.trim(y, top_db=30)
+    sf.write(output_path, trimmed_audio, sr)
+    return output_path
+
+
+# ---------------------- Main Class ----------------------
+class SRTDubbing:
+    def __init__(self, use_ffmpeg=True, ffmpeg_path="ffmpeg"):
+        self.use_ffmpeg = use_ffmpeg
+        self.ffmpeg_path = ffmpeg_path
+        self.cache_dir = "./cache"
+        os.makedirs("./dummy", exist_ok=True)
+        os.makedirs(self.cache_dir, exist_ok=True)
+
+    @staticmethod
+    def convert_to_millisecond(t):
+        return t.hours * 3600000 + t.minutes * 60000 + t.seconds * 1000 + int(t.milliseconds)
+
+    @staticmethod
+    def read_srt_file(file_path):
+        subs = pysrt.open(file_path, encoding='utf-8')
+        entries = []
+        prev_end = 0
+        for idx, sub in enumerate(subs, 1):
+            start, end = SRTDubbing.convert_to_millisecond(sub.start), SRTDubbing.convert_to_millisecond(sub.end)
+            pause = start - prev_end if idx > 1 else start
+            entries.append({
+                'entry_number': idx,
+                'start_time': start,
+                'end_time': end,
+                'text': sub.text.strip(),
+                'pause_time': pause,
+                'audio_name': f"{idx}.wav",
+                'previous_pause': f"{idx}_before_pause.wav",
+            })
+            prev_end = end
+        return entries
+
+    def text_to_speech_srt(self, text, audio_path, language, voice, actual_duration):
+        temp = "./cache/temp.wav"
+        # Step 1: Generate initial audio
+        path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=False, keep_silence_up_to=0.05)
+        # ✂️ Remove leading and trailing silence to make timing tight without trimming actual speech.
+        remove_edge_silence(path, temp)
+        # 📏 Load the trimmed audio and get its duration in milliseconds.
+        audio = AudioSegment.from_file(temp)
+
+        # ⏱️ If no duration is specified (edge case), use the TTS as-is without speed/timing adjustments.
+        if actual_duration == 0:
+            shutil.move(temp, audio_path)
+            return
+
+        # Step 2: If TTS audio is longer, retry with remove_silence=True
+        if len(audio) > actual_duration:
+            path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=1, remove_silence=True, keep_silence_up_to=0.05)
+            remove_edge_silence(path, temp)
+            audio = AudioSegment.from_file(temp)
+
+        # Step 3: If still longer → speed up
+        if len(audio) > actual_duration:
+            factor = len(audio) / actual_duration
+            path, _ = generate_and_save_audio(text, Language=language, voice=voice, speed=factor, remove_silence=True, keep_silence_up_to=0.05)
+            remove_edge_silence(path, temp)
+            audio = AudioSegment.from_file(temp)
+
+        # Final Adjustment: Speed up via FFmpeg or librosa
+        if len(audio) > actual_duration:
+            factor = len(audio) / actual_duration
+            final_temp = "./cache/speedup_temp.wav"
+            change_speed(temp, final_temp, factor, self.use_ffmpeg, self.ffmpeg_path)
+            shutil.move(final_temp, audio_path)
+
+        # Add silence if too short
+        elif len(audio) < actual_duration:
+            silence = AudioSegment.silent(duration=actual_duration - len(audio))
+            (audio + silence).export(audio_path, format="wav")
+        # ➡️ Fallback: If TTS already perfectly matches subtitle duration, save as-is.
+        else:
+            shutil.move(temp, audio_path)  #bad code
+
+    @staticmethod
+    def make_silence(duration, path):
+        AudioSegment.silent(duration=duration).export(path, format="wav")
+
+    @staticmethod
+    def create_folder_for_srt(srt_file_path):
+        base = os.path.splitext(os.path.basename(srt_file_path))[0]
+        folder = f"./dummy/{base}_{str(uuid.uuid4())[:4]}"
+        os.makedirs(folder, exist_ok=True)
+        return folder
+
+    @staticmethod
+    def concatenate_audio_files(paths, output):
+        audio = sum([AudioSegment.from_file(p) for p in paths], AudioSegment.silent(duration=0))
+        audio.export(output, format="wav")
+
+    def srt_to_dub(self, srt_path, output_path, language, voice):
+        entries = self.read_srt_file(srt_path)
+        folder = self.create_folder_for_srt(srt_path)
+        all_audio = []
+        for entry in tqdm(entries):
+            self.make_silence(entry['pause_time'], os.path.join(folder, entry['previous_pause']))
+            all_audio.append(os.path.join(folder, entry['previous_pause']))
+
+            tts_path = os.path.join(folder, entry['audio_name'])
+            self.text_to_speech_srt(entry['text'], tts_path, language, voice, entry['end_time'] - entry['start_time'])
+            all_audio.append(tts_path)
+
+        self.concatenate_audio_files(all_audio, output_path)
+
+
+# ---------------------- Entrypoint ----------------------
+def srt_process(srt_path, Language="American English", voice_name="af_bella", translate=False):
+    if not srt_path.endswith(".srt"):
+        gr.Error("Please upload a valid .srt file", duration=5)
+        return None
+
+    use_ffmpeg, ffmpeg_path = is_ffmpeg_installed()
+    processed_srt = prepare_srt(srt_path, Language, translate)
+    output_path = get_subtitle_Dub_path(srt_path, Language)
+
+    SRTDubbing(use_ffmpeg, ffmpeg_path).srt_to_dub(processed_srt, output_path, Language, voice_name)
+    return output_path
+
+def subtitle_ui():
+    with gr.Blocks() as demo:
+
+        gr.Markdown(
+            """
+            # Generate Audio File From Subtitle [Upload Only .srt file]
+
+            To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle)
+
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                srt_file = gr.File(label='Upload .srt Subtitle File Only')
+                with gr.Row():
+                    language_name = gr.Dropdown(lang_list, label="🌍 Select Language", value=lang_list[0])
+                with gr.Row():
+                    voice = gr.Dropdown(
+                        voice_names,
+                        value='af_bella',
+                        allow_custom_value=False,
+                        label='🎙️ Choose VoicePack',
+                    )
+                with gr.Row():
+                    generate_btn_ = gr.Button('Generate', variant='primary')
+
+                with gr.Accordion('Other Settings', open=False):
+                    translate_text = gr.Checkbox(value=False, label='🌐 Translate Text to Selected Language')
+
+            with gr.Column():
+                audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+                with gr.Accordion('Enable Autoplay', open=False):
+                    autoplay = gr.Checkbox(value=True, label='Autoplay')
+                    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+
+        # srt_file.submit(
+        #     srt_process,
+        #     inputs=[srt_file, voice],
+        #     outputs=[audio]
+        # )
+        generate_btn_.click(
+            srt_process,
+            inputs=[srt_file, language_name, voice, translate_text],
+            outputs=[audio]
+        )
+    return demo
+
+
+# Example usage:
+# srt_file_path = "/content/me.srt"
+# dub_audio_path = srt_process(srt_file_path, Language="American English", voice_name="af_bella", translate=False)
+# print(f"Audio file saved at: {dub_audio_path}")
 
 import click
 @click.command()
@@ -622,8 +919,9 @@ import click
 def main(debug, share):
 # def main(debug=True, share=True):
     demo1 = ui()
-    demo2 = tutorial()
-    demo = gr.TabbedInterface([demo1, demo2],["Multilingual TTS","VoicePack Explanation"],title="Kokoro TTS")#,theme='JohnSmith9982/small_and_pretty')
+    demo2 = subtitle_ui()
+    demo3 = tutorial()
+    demo = gr.TabbedInterface([demo1, demo2, demo3],["Multilingual TTS","SRT Dubbing","VoicePack Explanation"],title="Kokoro TTS")#,theme='JohnSmith9982/small_and_pretty')
     demo.queue().launch(debug=debug, share=share)
     # demo.queue().launch(debug=debug, share=share,server_port=9000)
     #Run on local network
@@ -638,4 +936,4 @@ last_used_language = "a"
 pipeline = KPipeline(lang_code=last_used_language)
 temp_folder = create_audio_dir()
 if __name__ == "__main__":
-    main()
+    main()
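One detail of the new dubbing pipeline worth tracing: `read_srt_file` turns each cue into silence for the gap since the previous cue plus speech for the cue itself, so simple concatenation reproduces the original timeline. A minimal sketch with invented timings:

```python
# Two cues: 0.0-2.0 s and 3.5-5.0 s (all values in milliseconds).
entries = [
    {"start_time": 0, "end_time": 2000},
    {"start_time": 3500, "end_time": 5000},
]
prev_end = 0
for idx, e in enumerate(entries, 1):
    pause = e["start_time"] - prev_end if idx > 1 else e["start_time"]
    speech = e["end_time"] - e["start_time"]
    print(f"cue {idx}: {pause} ms silence, then {speech} ms speech")
    prev_end = e["end_time"]
# cue 1: 0 ms silence, then 2000 ms speech
# cue 2: 1500 ms silence, then 1500 ms speech
```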
 
 