Voff

Sleeping

App Files Files Community

TDN-M committed on Feb 12

Commit

b8fd884

verified ·

1 Parent(s): 08179b5

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -37

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ import re
 import time
 import uuid
 from io import StringIO
 import gradio as gr
 import spaces
 import torch
@@ -14,10 +13,10 @@ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
 # download for mecab
 os.system("python -m unidic download")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 api = HfApi(token=HF_TOKEN)
@@ -26,9 +25,7 @@ print("Downloading if not downloaded viXTTS")
 checkpoint_dir = "model/"
 repo_id = "capleaf/viXTTS"
 use_deepspeed = False
 os.makedirs(checkpoint_dir, exist_ok=True)
 required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
 files_in_dir = os.listdir(checkpoint_dir)
 if not all(file in files_in_dir for file in required_files):
@@ -42,7 +39,6 @@ if not all(file in files_in_dir for file in required_files):
         filename="speakers_xtts.pth",
         local_dir=checkpoint_dir,
     )
 xtts_config = os.path.join(checkpoint_dir, "config.json")
 config = XttsConfig()
 config.load_json(xtts_config)
@@ -52,12 +48,10 @@ MODEL.load_checkpoint(
 )
 if torch.cuda.is_available():
     MODEL.cuda()
 supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
 def normalize_vietnamese_text(text):
     text = (
         TTSnorm(text, unknown=False, lower=False, rule=True)
@@ -70,59 +64,52 @@ def normalize_vietnamese_text(text):
         .replace("'", "")
         .replace("AI", "Ây Ai")
         .replace("A.I", "Ây Ai")
-        .replace("%"), "phần trăm"
     )
     return text
 def calculate_keep_len(text, lang):
     """Work out the length cap applied to audio generated for short text.

     The caller uses the result as the end index when slicing the
     generated waveform.

     Args:
         text: Prompt text; it is split on whitespace to count words.
         lang: Language code of the prompt.

     Returns:
         -1 when ``lang`` is "ja" or "zh-cn", or when the text has ten or
         more words. Otherwise a per-word allowance (15000 for fewer than
         five words, 13000 for five to nine) multiplied by the word count,
         plus 2000 for every ".", "!", "?" or "," found in the text.
     """
     if lang in ("ja", "zh-cn"):
         return -1

     n_words = len(text.split())
     if n_words >= 10:
         return -1

     # Each punctuation mark adds a fixed allowance on top of the word budget.
     n_marks = sum(text.count(mark) for mark in ".!?,")
     per_word = 15000 if n_words < 5 else 13000
     return per_word * n_words + 2000 * n_marks
 @spaces.GPU
 def predict(
     prompt,
     language,
     audio_file_pth,
     normalize_text=True,
 ):
     if language not in supported_languages:
         metrics_text = gr.Warning(
-            f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
         )
         return (None, metrics_text)
     speaker_wav = audio_file_pth
     if len(prompt) < 2:
         metrics_text = gr.Warning("Please give a longer prompt text")
         return (None, metrics_text)
-    # if len(prompt) > 250:
-    #     metrics_text = gr.Warning(
-    #         str(len(prompt))
-    #         + " characters.\n"
-    #         + "Your prompt is too long, please keep it under 250 characters\n"
-    #         + "Văn bản quá dài, vui lòng giữ dưới 250 ký tự."
-    #     )
-    #     return (None, metrics_text)
     try:
         metrics_text = ""
         t_latent = time.time()
         try:
             (
                 gpt_cond_latent,
@@ -133,7 +120,6 @@ def predict(
                 gpt_cond_chunk_len=4,
                 max_ref_length=60,
             )
         except Exception as e:
             print("Speaker encoding error", str(e))
             metrics_text = gr.Warning(
@@ -142,10 +128,8 @@ def predict(
             return (None, metrics_text)
         prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
         if normalize_text and language == "vi":
             prompt = normalize_vietnamese_text(prompt)
         print("I: Generating new audio...")
         t0 = time.time()
         out = MODEL.inference(
@@ -169,9 +153,7 @@ def predict(
         # Temporary hack for short sentences
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
     except RuntimeError as e:
         if "device-side assert" in str(e):
             # cannot do anything on cuda device side error, need to restart
@@ -181,7 +163,6 @@ def predict(
             )
             gr.Warning("Unhandled Exception encounter, please retry in a minute")
             print("Cuda device-assert Runtime encountered need restart")
             error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
             error_data = [
                 error_time,
@@ -195,7 +176,6 @@ def predict(
             write_io = StringIO()
             csv.writer(write_io).writerows([error_data])
             csv_upload = write_io.getvalue().encode()
             filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
             print("Writing error csv")
             error_api = HfApi()
@@ -205,7 +185,6 @@ def predict(
                 repo_id="coqui/xtts-flagged-dataset",
                 repo_type="dataset",
             )
             # speaker_wav
             print("Writing error reference audio")
             speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
@@ -216,19 +195,17 @@ def predict(
                 repo_id="coqui/xtts-flagged-dataset",
                 repo_type="dataset",
             )
             # HF Space specific.. This error is unrecoverable need to restart space
             space = api.get_space_runtime(repo_id=repo_id)
             if space.stage != "BUILDING":
                 api.restart_space(repo_id=repo_id)
             else:
                 print("TRIED TO RESTART but space is building")
         else:
             if "Failed to decode" in str(e):
                 print("Speaker encoding error", str(e))
                 metrics_text = gr.Warning(
-                    metrics_text="It appears something wrong with reference, did you unmute your microphone?"
                 )
             else:
                 print("RuntimeError: non device-side assert error:", str(e))
@@ -238,7 +215,7 @@ def predict(
             return (None, metrics_text)
     return ("output.wav", metrics_text)
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
         with gr.Column():
@@ -288,6 +265,16 @@ with gr.Blocks(analytics_enabled=False) as demo:
                 info="Normalize Vietnamese text",
                 value=True,
             )
             ref_gr = gr.Audio(
                 label="Reference Audio (Giọng mẫu)",
                 type="filepath",
@@ -311,6 +298,8 @@ with gr.Blocks(analytics_enabled=False) as demo:
             language_gr,
             ref_gr,
             normalize_text,
         ],
         outputs=[audio_gr, out_text_gr],
         api_name="predict",

 import time
 import uuid
 from io import StringIO
 import gradio as gr
 import spaces
 import torch
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
+from content_generation import create_content  # Nhập hàm create_content từ file content_generation.py
 # download for mecab
 os.system("python -m unidic download")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 api = HfApi(token=HF_TOKEN)
 checkpoint_dir = "model/"
 repo_id = "capleaf/viXTTS"
 use_deepspeed = False
 os.makedirs(checkpoint_dir, exist_ok=True)
 required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
 files_in_dir = os.listdir(checkpoint_dir)
 if not all(file in files_in_dir for file in required_files):
         filename="speakers_xtts.pth",
         local_dir=checkpoint_dir,
     )
 xtts_config = os.path.join(checkpoint_dir, "config.json")
 config = XttsConfig()
 config.load_json(xtts_config)
 )
 if torch.cuda.is_available():
     MODEL.cuda()
 supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
 def normalize_vietnamese_text(text):
     text = (
         TTSnorm(text, unknown=False, lower=False, rule=True)
         .replace("'", "")
         .replace("AI", "Ây Ai")
         .replace("A.I", "Ây Ai")
+        .replace("%", "phần trăm")
     )
     return text
 def calculate_keep_len(text, lang):
     """Work out the length cap applied to audio generated for short text.

     The caller uses the result as the end index when slicing the
     generated waveform.

     Args:
         text: Prompt text; it is split on whitespace to count words.
         lang: Language code of the prompt.

     Returns:
         -1 when ``lang`` is "ja" or "zh-cn", or when the text has ten or
         more words. Otherwise a per-word allowance (15000 for fewer than
         five words, 13000 for five to nine) multiplied by the word count,
         plus 2000 for every ".", "!", "?" or "," found in the text.
     """
     if lang in ("ja", "zh-cn"):
         return -1

     n_words = len(text.split())
     if n_words >= 10:
         return -1

     # Each punctuation mark adds a fixed allowance on top of the word budget.
     n_marks = sum(text.count(mark) for mark in ".!?,")
     per_word = 15000 if n_words < 5 else 13000
     return per_word * n_words + 2000 * n_marks
 @spaces.GPU
 def predict(
     prompt,
     language,
     audio_file_pth,
     normalize_text=True,
+    use_llm=False,  # Thêm tùy chọn sử dụng LLM
+    content_type="Theo yêu cầu",  # Loại nội dung (ví dụ: "triết lý sống" hoặc "Theo yêu cầu")
 ):
+    if use_llm:
+        # Nếu sử dụng LLM, tạo nội dung văn bản từ đầu vào
+        print("I: Generating text with LLM...")
+        generated_text = create_content(prompt, content_type, language)
+        print(f"Generated text: {generated_text}")
+        prompt = generated_text  # Gán văn bản được tạo bởi LLM vào biến prompt
     if language not in supported_languages:
         metrics_text = gr.Warning(
+            f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
         )
         return (None, metrics_text)
     speaker_wav = audio_file_pth
     if len(prompt) < 2:
         metrics_text = gr.Warning("Please give a longer prompt text")
         return (None, metrics_text)
     try:
         metrics_text = ""
         t_latent = time.time()
         try:
             (
                 gpt_cond_latent,
                 gpt_cond_chunk_len=4,
                 max_ref_length=60,
             )
         except Exception as e:
             print("Speaker encoding error", str(e))
             metrics_text = gr.Warning(
             return (None, metrics_text)
         prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
         if normalize_text and language == "vi":
             prompt = normalize_vietnamese_text(prompt)
         print("I: Generating new audio...")
         t0 = time.time()
         out = MODEL.inference(
         # Temporary hack for short sentences
         keep_len = calculate_keep_len(prompt, language)
         out["wav"] = out["wav"][:keep_len]
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
     except RuntimeError as e:
         if "device-side assert" in str(e):
             # cannot do anything on cuda device side error, need to restart
             )
             gr.Warning("Unhandled Exception encounter, please retry in a minute")
             print("Cuda device-assert Runtime encountered need restart")
             error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
             error_data = [
                 error_time,
             write_io = StringIO()
             csv.writer(write_io).writerows([error_data])
             csv_upload = write_io.getvalue().encode()
             filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
             print("Writing error csv")
             error_api = HfApi()
                 repo_id="coqui/xtts-flagged-dataset",
                 repo_type="dataset",
             )
             # speaker_wav
             print("Writing error reference audio")
             speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
                 repo_id="coqui/xtts-flagged-dataset",
                 repo_type="dataset",
             )
             # HF Space specific.. This error is unrecoverable need to restart space
             space = api.get_space_runtime(repo_id=repo_id)
             if space.stage != "BUILDING":
                 api.restart_space(repo_id=repo_id)
             else:
                 print("TRIED TO RESTART but space is building")
         else:
             if "Failed to decode" in str(e):
                 print("Speaker encoding error", str(e))
                 metrics_text = gr.Warning(
+                    "It appears something wrong with reference, did you unmute your microphone?"
                 )
             else:
                 print("RuntimeError: non device-side assert error:", str(e))
             return (None, metrics_text)
     return ("output.wav", metrics_text)
+# Cập nhật giao diện Gradio
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
         with gr.Column():
                 info="Normalize Vietnamese text",
                 value=True,
             )
+            use_llm_checkbox = gr.Checkbox(
+                label="Sử dụng LLM để tạo nội dung",
+                info="Use LLM to generate content",
+                value=False,
+            )
+            content_type_dropdown = gr.Dropdown(
+                label="Loại nội dung",
+                choices=["triết lý sống", "Theo yêu cầu"],
+                value="Theo yêu cầu",
+            )
             ref_gr = gr.Audio(
                 label="Reference Audio (Giọng mẫu)",
                 type="filepath",
             language_gr,
             ref_gr,
             normalize_text,
+            use_llm_checkbox,  # Thêm checkbox để bật/tắt LLM
+            content_type_dropdown,  # Thêm dropdown để chọn loại nội dung
         ],
         outputs=[audio_gr, out_text_gr],
         api_name="predict",