Voff

Sleeping

App Files Files Community

TDN-M committed on Feb 13

Commit

b9b18bd

verified ·

1 Parent(s): ef53e94

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -13

app.py CHANGED Viewed

@@ -13,7 +13,13 @@ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
-from content_generation import create_content  # Nhập hàm create_content từ file content_generation.py
 # download for mecab
 os.system("python -m unidic download")
@@ -52,6 +58,20 @@ supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
 def normalize_vietnamese_text(text):
     text = (
         TTSnorm(text, unknown=False, lower=False, rule=True)
@@ -80,6 +100,25 @@ def calculate_keep_len(text, lang):
         return 13000 * word_count + 2000 * num_punct
     return -1
 @spaces.GPU
 def predict(
     prompt,
@@ -100,12 +139,12 @@ def predict(
         metrics_text = gr.Warning(
             f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
         )
-        return (None, metrics_text)
     speaker_wav = audio_file_pth
     if len(prompt) < 2:
         metrics_text = gr.Warning("Please give a longer prompt text")
-        return (None, metrics_text)
     try:
         metrics_text = ""
@@ -125,7 +164,7 @@ def predict(
             metrics_text = gr.Warning(
                 "It appears something wrong with reference, did you unmute your microphone?"
             )
-            return (None, metrics_text)
         prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
         if normalize_text and language == "vi":
@@ -154,6 +193,17 @@ def predict(
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
     except RuntimeError as e:
         if "device-side assert" in str(e):
             # cannot do anything on cuda device side error, need to restart
@@ -212,8 +262,8 @@ def predict(
                 metrics_text = gr.Warning(
                     "Something unexpected happened please retry again."
                 )
-            return (None, metrics_text)
-    return ("output.wav", metrics_text)
 # Cập nhật giao diện Gradio
 with gr.Blocks(analytics_enabled=False) as demo:
@@ -221,7 +271,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Column():
             gr.Markdown(
                 """
-                # tts@TDNM ✨ https:www.tdn-m.com
                 """
             )
         with gr.Column():
@@ -231,9 +281,9 @@ with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
         with gr.Column():
             input_text_gr = gr.Textbox(
-                label="Bạn cần nội dung gì?",
-                info="Tôi có thể viết và thu âm luôn cho bạn",
-                value="Lời tự sự của AI, 150 từ",
             )
             language_gr = gr.Dropdown(
                 label="Language (Ngôn ngữ)",
@@ -268,7 +318,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             use_llm_checkbox = gr.Checkbox(
                 label="Sử dụng LLM để tạo nội dung",
                 info="Use LLM to generate content",
-                value=True,
             )
             content_type_dropdown = gr.Dropdown(
                 label="Loại nội dung",
@@ -278,7 +328,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             ref_gr = gr.Audio(
                 label="Reference Audio (Giọng mẫu)",
                 type="filepath",
-                value="nam-tai-llieu.wav",
             )
             tts_button = gr.Button(
                 "Đọc 🗣️🔥",
@@ -289,6 +339,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Column():
             audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
             out_text_gr = gr.Text(label="Metrics")
     tts_button.click(
@@ -301,7 +352,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             use_llm_checkbox,  # Thêm checkbox để bật/tắt LLM
             content_type_dropdown,  # Thêm dropdown để chọn loại nội dung
         ],
-        outputs=[audio_gr, out_text_gr],
         api_name="predict",
     )

 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
+from langchain.llms import HuggingFacePipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from components import caption_chain, tag_chain
+from components import pexels, utils
+import cv2
+from moviepy.editor import AudioFileClip, ImageSequenceClip
+import gc
 # download for mecab
 os.system("python -m unidic download")
 if not "vi" in supported_languages:
     supported_languages.append("vi")
+# Build a local text2text-generation pipeline at import time and wrap it for the caption and tag chains
+model = AutoModelForSeq2SeqLM.from_pretrained("declare-lab/flan-alpaca-large")
+tokenizer = AutoTokenizer.from_pretrained("declare-lab/flan-alpaca-large")
+pipe = pipeline(
+    'text2text-generation',
+    model=model,
+    tokenizer=tokenizer,
+    max_length=120
+)
+local_llm = HuggingFacePipeline(pipeline=pipe)  # LangChain wrapper shared by both chains below
+llm_chain = caption_chain.chain(llm=local_llm)
+sum_llm_chain = tag_chain.chain(llm=local_llm)
+pexels_api_key = os.getenv('pexels_api_key')  # None when the environment variable is unset
 def normalize_vietnamese_text(text):
     text = (
         TTSnorm(text, unknown=False, lower=False, rule=True)
         return 13000 * word_count + 2000 * num_punct
     return -1
+def create_video_from_audio(audio_path, images, output_path):  # Combine an image sequence with an audio file and write the result to output_path
+    audio_clip = AudioFileClip(audio_path)
+    duration = audio_clip.duration
+    # Frame rate = image count / audio duration (presumably so the image sequence lasts as long as the audio); raises ZeroDivisionError if duration is 0
+    frame_rate = len(images) / duration
+    # Build the video clip from the image sequence at the computed frame rate
+    video_clip = ImageSequenceClip(images, fps=frame_rate)
+    # Attach the audio track; the result of set_audio is kept as its own clip and closed separately below
+    final_clip = video_clip.set_audio(audio_clip)
+    # Encode to output_path with the libx264 video codec and the aac audio codec
+    final_clip.write_videofile(output_path, codec='libx264', audio_codec='aac')
+    audio_clip.close()  # NOTE(review): the three close() calls are skipped if anything above raises (no try/finally)
+    video_clip.close()
+    final_clip.close()
 @spaces.GPU
 def predict(
     prompt,
         metrics_text = gr.Warning(
             f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
         )
+        return (None, None, metrics_text)
     speaker_wav = audio_file_pth
     if len(prompt) < 2:
         metrics_text = gr.Warning("Please give a longer prompt text")
+        return (None, None, metrics_text)
     try:
         metrics_text = ""
             metrics_text = gr.Warning(
                 "It appears something wrong with reference, did you unmute your microphone?"
             )
+            return (None, None, metrics_text)
         prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
         if normalize_text and language == "vi":
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+        # Tạo video từ file audio
+        print("I: Generating video from audio...")
+        folder_name, sentences = pexels.generate_videos(prompt, pexels_api_key, "landscape", 1080, 1920, llm_chain, sum_llm_chain)
+        utils.combine_videos(folder_name)
+        video_path = os.path.join(folder_name, "Final_Ad_Video.mp4")
+        print(f"I: Video generated at {video_path}")
+        metrics_text += f"Video generated at {video_path}\n"
+        return ("output.wav", video_path, metrics_text)
     except RuntimeError as e:
         if "device-side assert" in str(e):
             # cannot do anything on cuda device side error, need to restart
                 metrics_text = gr.Warning(
                     "Something unexpected happened please retry again."
                 )
+            return (None, None, metrics_text)
+    return ("output.wav", None, metrics_text)
 # Cập nhật giao diện Gradio
 with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Column():
             gr.Markdown(
                 """
+                # tts@TDNM ✨ https://www.tdn-m.com
                 """
             )
         with gr.Column():
     with gr.Row():
         with gr.Column():
             input_text_gr = gr.Textbox(
+                label="Text Prompt (Văn bản cần đọc)",
+                info="Mỗi câu nên từ 10 từ trở lên.",
+                value="Xin chào, tôi là một mô hình chuyển đổi văn bản thành giọng nói tiếng Việt.",
             )
             language_gr = gr.Dropdown(
                 label="Language (Ngôn ngữ)",
             use_llm_checkbox = gr.Checkbox(
                 label="Sử dụng LLM để tạo nội dung",
                 info="Use LLM to generate content",
+                value=False,
             )
             content_type_dropdown = gr.Dropdown(
                 label="Loại nội dung",
             ref_gr = gr.Audio(
                 label="Reference Audio (Giọng mẫu)",
                 type="filepath",
+                source="upload",  # Đảm bảo tệp âm thanh được tải lên bởi người dùng
             )
             tts_button = gr.Button(
                 "Đọc 🗣️🔥",
         with gr.Column():
             audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+            video_gr = gr.Video(label="Generated Video")
             out_text_gr = gr.Text(label="Metrics")
     tts_button.click(
             use_llm_checkbox,  # Thêm checkbox để bật/tắt LLM
             content_type_dropdown,  # Thêm dropdown để chọn loại nội dung
         ],
+        outputs=[audio_gr, video_gr, out_text_gr],
         api_name="predict",
     )