TDN-M committed on
Commit
b9b18bd
·
verified ·
1 Parent(s): ef53e94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -13
app.py CHANGED
@@ -13,7 +13,13 @@ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
13
  from TTS.tts.configs.xtts_config import XttsConfig
14
  from TTS.tts.models.xtts import Xtts
15
  from vinorm import TTSnorm
16
- from content_generation import create_content # Nhập hàm create_content từ file content_generation.py
 
 
 
 
 
 
17
 
18
  # download for mecab
19
  os.system("python -m unidic download")
@@ -52,6 +58,20 @@ supported_languages = config.languages
52
  if not "vi" in supported_languages:
53
  supported_languages.append("vi")
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def normalize_vietnamese_text(text):
56
  text = (
57
  TTSnorm(text, unknown=False, lower=False, rule=True)
@@ -80,6 +100,25 @@ def calculate_keep_len(text, lang):
80
  return 13000 * word_count + 2000 * num_punct
81
  return -1
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  @spaces.GPU
84
  def predict(
85
  prompt,
@@ -100,12 +139,12 @@ def predict(
100
  metrics_text = gr.Warning(
101
  f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
102
  )
103
- return (None, metrics_text)
104
 
105
  speaker_wav = audio_file_pth
106
  if len(prompt) < 2:
107
  metrics_text = gr.Warning("Please give a longer prompt text")
108
- return (None, metrics_text)
109
 
110
  try:
111
  metrics_text = ""
@@ -125,7 +164,7 @@ def predict(
125
  metrics_text = gr.Warning(
126
  "It appears something wrong with reference, did you unmute your microphone?"
127
  )
128
- return (None, metrics_text)
129
 
130
  prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
131
  if normalize_text and language == "vi":
@@ -154,6 +193,17 @@ def predict(
154
  keep_len = calculate_keep_len(prompt, language)
155
  out["wav"] = out["wav"][:keep_len]
156
  torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
 
 
 
 
 
 
 
 
 
 
 
157
  except RuntimeError as e:
158
  if "device-side assert" in str(e):
159
  # cannot do anything on cuda device side error, need to restart
@@ -212,8 +262,8 @@ def predict(
212
  metrics_text = gr.Warning(
213
  "Something unexpected happened please retry again."
214
  )
215
- return (None, metrics_text)
216
- return ("output.wav", metrics_text)
217
 
218
  # Cập nhật giao diện Gradio
219
  with gr.Blocks(analytics_enabled=False) as demo:
@@ -221,7 +271,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
221
  with gr.Column():
222
  gr.Markdown(
223
  """
224
- # tts@TDNM ✨ https:www.tdn-m.com
225
  """
226
  )
227
  with gr.Column():
@@ -231,9 +281,9 @@ with gr.Blocks(analytics_enabled=False) as demo:
231
  with gr.Row():
232
  with gr.Column():
233
  input_text_gr = gr.Textbox(
234
- label="Bạn cần nội dung gì?",
235
- info="Tôi thể viết thu âm luôn cho bạn",
236
- value="Lời tự sự của AI, 150 từ",
237
  )
238
  language_gr = gr.Dropdown(
239
  label="Language (Ngôn ngữ)",
@@ -268,7 +318,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
268
  use_llm_checkbox = gr.Checkbox(
269
  label="Sử dụng LLM để tạo nội dung",
270
  info="Use LLM to generate content",
271
- value=True,
272
  )
273
  content_type_dropdown = gr.Dropdown(
274
  label="Loại nội dung",
@@ -278,7 +328,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
278
  ref_gr = gr.Audio(
279
  label="Reference Audio (Giọng mẫu)",
280
  type="filepath",
281
- value="nam-tai-llieu.wav",
282
  )
283
  tts_button = gr.Button(
284
  "Đọc 🗣️🔥",
@@ -289,6 +339,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
289
 
290
  with gr.Column():
291
  audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
 
292
  out_text_gr = gr.Text(label="Metrics")
293
 
294
  tts_button.click(
@@ -301,7 +352,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
301
  use_llm_checkbox, # Thêm checkbox để bật/tắt LLM
302
  content_type_dropdown, # Thêm dropdown để chọn loại nội dung
303
  ],
304
- outputs=[audio_gr, out_text_gr],
305
  api_name="predict",
306
  )
307
 
 
13
  from TTS.tts.configs.xtts_config import XttsConfig
14
  from TTS.tts.models.xtts import Xtts
15
  from vinorm import TTSnorm
16
+ from langchain.llms import HuggingFacePipeline
17
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
18
+ from components import caption_chain, tag_chain
19
+ from components import pexels, utils
20
+ import cv2
21
+ from moviepy.editor import AudioFileClip, ImageSequenceClip
22
+ import gc
23
 
24
  # download for mecab
25
  os.system("python -m unidic download")
 
58
  if not "vi" in supported_languages:
59
  supported_languages.append("vi")
60
 
61
+ # Load LangChain components
62
+ model = AutoModelForSeq2SeqLM.from_pretrained("declare-lab/flan-alpaca-large")
63
+ tokenizer = AutoTokenizer.from_pretrained("declare-lab/flan-alpaca-large")
64
+ pipe = pipeline(
65
+ 'text2text-generation',
66
+ model=model,
67
+ tokenizer=tokenizer,
68
+ max_length=120
69
+ )
70
+ local_llm = HuggingFacePipeline(pipeline=pipe)
71
+ llm_chain = caption_chain.chain(llm=local_llm)
72
+ sum_llm_chain = tag_chain.chain(llm=local_llm)
73
+ pexels_api_key = os.getenv('pexels_api_key')
74
+
75
  def normalize_vietnamese_text(text):
76
  text = (
77
  TTSnorm(text, unknown=False, lower=False, rule=True)
 
100
  return 13000 * word_count + 2000 * num_punct
101
  return -1
102
 
103
+ def create_video_from_audio(audio_path, images, output_path):
104
+ audio_clip = AudioFileClip(audio_path)
105
+ duration = audio_clip.duration
106
+
107
+ # Calculate frame rate based on number of images and audio duration
108
+ frame_rate = len(images) / duration
109
+
110
+ # Create video clip from images
111
+ video_clip = ImageSequenceClip(images, fps=frame_rate)
112
+
113
+ # Set audio for video clip
114
+ final_clip = video_clip.set_audio(audio_clip)
115
+
116
+ # Write result to file
117
+ final_clip.write_videofile(output_path, codec='libx264', audio_codec='aac')
118
+ audio_clip.close()
119
+ video_clip.close()
120
+ final_clip.close()
121
+
122
  @spaces.GPU
123
  def predict(
124
  prompt,
 
139
  metrics_text = gr.Warning(
140
  f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
141
  )
142
+ return (None, None, metrics_text)
143
 
144
  speaker_wav = audio_file_pth
145
  if len(prompt) < 2:
146
  metrics_text = gr.Warning("Please give a longer prompt text")
147
+ return (None, None, metrics_text)
148
 
149
  try:
150
  metrics_text = ""
 
164
  metrics_text = gr.Warning(
165
  "It appears something wrong with reference, did you unmute your microphone?"
166
  )
167
+ return (None, None, metrics_text)
168
 
169
  prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
170
  if normalize_text and language == "vi":
 
193
  keep_len = calculate_keep_len(prompt, language)
194
  out["wav"] = out["wav"][:keep_len]
195
  torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
196
+
197
+ # Tạo video từ file audio
198
+ print("I: Generating video from audio...")
199
+ folder_name, sentences = pexels.generate_videos(prompt, pexels_api_key, "landscape", 1080, 1920, llm_chain, sum_llm_chain)
200
+ utils.combine_videos(folder_name)
201
+ video_path = os.path.join(folder_name, "Final_Ad_Video.mp4")
202
+
203
+ print(f"I: Video generated at {video_path}")
204
+ metrics_text += f"Video generated at {video_path}\n"
205
+
206
+ return ("output.wav", video_path, metrics_text)
207
  except RuntimeError as e:
208
  if "device-side assert" in str(e):
209
  # cannot do anything on cuda device side error, need to restart
 
262
  metrics_text = gr.Warning(
263
  "Something unexpected happened please retry again."
264
  )
265
+ return (None, None, metrics_text)
266
+ return ("output.wav", None, metrics_text)
267
 
268
  # Cập nhật giao diện Gradio
269
  with gr.Blocks(analytics_enabled=False) as demo:
 
271
  with gr.Column():
272
  gr.Markdown(
273
  """
274
+ # tts@TDNM ✨ https://www.tdn-m.com
275
  """
276
  )
277
  with gr.Column():
 
281
  with gr.Row():
282
  with gr.Column():
283
  input_text_gr = gr.Textbox(
284
+ label="Text Prompt (Văn bản cần đọc)",
285
+ info="Mỗi câu nên từ 10 từ trở lên.",
286
+ value="Xin chào, tôi một hình chuyển đổi văn bản thành giọng nói tiếng Việt.",
287
  )
288
  language_gr = gr.Dropdown(
289
  label="Language (Ngôn ngữ)",
 
318
  use_llm_checkbox = gr.Checkbox(
319
  label="Sử dụng LLM để tạo nội dung",
320
  info="Use LLM to generate content",
321
+ value=False,
322
  )
323
  content_type_dropdown = gr.Dropdown(
324
  label="Loại nội dung",
 
328
  ref_gr = gr.Audio(
329
  label="Reference Audio (Giọng mẫu)",
330
  type="filepath",
331
+ source="upload", # Đảm bảo tệp âm thanh được tải lên bởi người dùng
332
  )
333
  tts_button = gr.Button(
334
  "Đọc 🗣️🔥",
 
339
 
340
  with gr.Column():
341
  audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
342
+ video_gr = gr.Video(label="Generated Video")
343
  out_text_gr = gr.Text(label="Metrics")
344
 
345
  tts_button.click(
 
352
  use_llm_checkbox, # Thêm checkbox để bật/tắt LLM
353
  content_type_dropdown, # Thêm dropdown để chọn loại nội dung
354
  ],
355
+ outputs=[audio_gr, video_gr, out_text_gr],
356
  api_name="predict",
357
  )
358