TDN-M committed
Commit 0fca0fa · verified · 1 Parent(s): 3285ef5

Update app.py

Files changed (1)
  1. app.py +32 -8
app.py CHANGED
@@ -15,8 +15,8 @@ from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
 from langchain.llms import HuggingFacePipeline
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-from components import caption_chain, tag_chain
-from components import pexels, utils
+from diffusers import StableDiffusionPipeline
+from PIL import Image
 import cv2
 from moviepy.editor import AudioFileClip, ImageSequenceClip
 import gc
@@ -73,6 +73,12 @@ llm_chain = caption_chain.chain(llm=local_llm)
 sum_llm_chain = tag_chain.chain(llm=local_llm)
 pexels_api_key = os.getenv('pexels_api_key')
 
+# Initialize Stable Diffusion Pipeline with TDN-M/East-asian-beauty
+image_gen_model_id = "TDN-M/East-asian-beauty"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+image_generator = StableDiffusionPipeline.from_pretrained(image_gen_model_id, torch_dtype=torch.float16)
+image_generator = image_generator.to(device)
+
 def normalize_vietnamese_text(text):
     text = (
         TTSnorm(text, unknown=False, lower=False, rule=True)
@@ -101,7 +107,7 @@ def calculate_keep_len(text, lang):
         return 13000 * word_count + 2000 * num_punct
     return -1
 
-def create_video_from_audio(audio_path, images, output_path):
+def create_video_from_audio_and_images(audio_path, images, output_path):
     audio_clip = AudioFileClip(audio_path)
     duration = audio_clip.duration
 
@@ -128,6 +134,19 @@ def truncate_prompt(prompt, tokenizer, max_length=512):
     prompt = tokenizer.convert_tokens_to_string(tokens)
     return prompt
 
+def generate_images_from_sentences(sentences, image_generator, folder_path):
+    try:
+        for i, sentence in enumerate(sentences):
+            print(f"Generating image for sentence {i + 1}: {sentence}")
+            image = image_generator(sentence, guidance_scale=7.5).images[0]
+            image_path = os.path.join(folder_path, f"image_{i + 1}.png")
+            image.save(image_path)
+            print(f"Saved image at {image_path}")
+    except Exception as e:
+        print("Error! Failed generating images")
+        print(e)
+        return []
+
 @spaces.GPU
 def predict(
     prompt,
@@ -207,18 +226,23 @@ def predict(
     out["wav"] = out["wav"][:keep_len]
     torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
 
-    # Create the video from the audio file
-    print("I: Generating video from audio...")
+    # Create the video from the audio file and the scenes
+    print("I: Generating images from sentences...")
     # Use a UUID to create a concise folder name
     folder_name = f"video_{uuid.uuid4().hex}"
    os.makedirs(folder_name, exist_ok=True)
     folder_path = os.path.join(folder_name, "images")
     os.makedirs(folder_path, exist_ok=True)
 
-    # Create the video from the audio file and the images
-    folder_name, sentences = pexels.generate_videos(prompt, pexels_api_key, "landscape", 1080, 1920, llm_chain, sum_llm_chain)
-    utils.combine_videos(folder_name)
+    # Split the prompt text into sentences
+    sentences = [x.strip() for x in re.split(r'[.!?]', prompt) if len(x.strip()) > 6]
+
+    # Generate an illustration for each sentence
+    images = generate_images_from_sentences(sentences, image_generator, folder_path)
+
+    # Create the video from the audio file and the images
     video_path = os.path.join(folder_name, "Final_Ad_Video.mp4")
+    create_video_from_audio_and_images("output.wav", images, video_path)
 
     print(f"I: Video generated at {video_path}")
     metrics_text += f"Video generated at {video_path}\n"
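
The diff renames create_video_from_audio to create_video_from_audio_and_images, but only the first two lines of the helper (loading the audio clip and reading its duration) appear in the hunk. Below is a minimal sketch of how such a helper could stitch the saved frames and the XTTS narration together with the already-imported AudioFileClip and ImageSequenceClip; the fps handling and codec choices are assumptions for illustration, not part of the commit.

# Illustrative sketch only; the commit does not show this body.
from moviepy.editor import AudioFileClip, ImageSequenceClip

def create_video_from_audio_and_images(audio_path, image_paths, output_path):
    # Load the XTTS narration and measure how long it runs.
    audio_clip = AudioFileClip(audio_path)
    duration = audio_clip.duration

    # Show each generated image for an equal slice of the narration
    # (assumes at least one image and a non-zero audio duration).
    fps = len(image_paths) / duration
    video_clip = ImageSequenceClip(image_paths, fps=fps)

    # Attach the narration and render the final MP4.
    video_clip = video_clip.set_audio(audio_clip)
    video_clip.write_videofile(output_path, fps=24, codec="libx264", audio_codec="aac")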
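
As committed, generate_images_from_sentences returns a list only in its except branch and implicitly returns None on success, even though predict() passes its result to create_video_from_audio_and_images as the images argument. A variant that collects and returns the saved image paths could look like the sketch below; it is an illustrative rewrite, not what the commit contains.

# Hypothetical variant (not in the commit): collect and return the saved paths.
import os

def generate_images_from_sentences(sentences, image_generator, folder_path):
    image_paths = []
    try:
        for i, sentence in enumerate(sentences):
            print(f"Generating image for sentence {i + 1}: {sentence}")
            image = image_generator(sentence, guidance_scale=7.5).images[0]
            image_path = os.path.join(folder_path, f"image_{i + 1}.png")
            image.save(image_path)
            image_paths.append(image_path)
            print(f"Saved image at {image_path}")
    except Exception as e:
        print("Error! Failed generating images")
        print(e)
    return image_paths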