Update app.py
Browse files
app.py
CHANGED
@@ -15,8 +15,8 @@ from TTS.tts.models.xtts import Xtts
|
|
15 |
from vinorm import TTSnorm
|
16 |
from langchain.llms import HuggingFacePipeline
|
17 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
18 |
-
from
|
19 |
-
from
|
20 |
import cv2
|
21 |
from moviepy.editor import AudioFileClip, ImageSequenceClip
|
22 |
import gc
|
@@ -73,6 +73,12 @@ llm_chain = caption_chain.chain(llm=local_llm)
|
|
73 |
sum_llm_chain = tag_chain.chain(llm=local_llm)
|
74 |
pexels_api_key = os.getenv('pexels_api_key')
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
def normalize_vietnamese_text(text):
|
77 |
text = (
|
78 |
TTSnorm(text, unknown=False, lower=False, rule=True)
|
@@ -101,7 +107,7 @@ def calculate_keep_len(text, lang):
|
|
101 |
return 13000 * word_count + 2000 * num_punct
|
102 |
return -1
|
103 |
|
104 |
-
def
|
105 |
audio_clip = AudioFileClip(audio_path)
|
106 |
duration = audio_clip.duration
|
107 |
|
@@ -128,6 +134,19 @@ def truncate_prompt(prompt, tokenizer, max_length=512):
|
|
128 |
prompt = tokenizer.convert_tokens_to_string(tokens)
|
129 |
return prompt
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
@spaces.GPU
|
132 |
def predict(
|
133 |
prompt,
|
@@ -207,18 +226,23 @@ def predict(
|
|
207 |
out["wav"] = out["wav"][:keep_len]
|
208 |
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
209 |
|
210 |
-
# Tạo video từ file audio
|
211 |
-
print("I: Generating
|
212 |
# Sử dụng UUID để tạo tên thư mục ngắn gọn
|
213 |
folder_name = f"video_{uuid.uuid4().hex}"
|
214 |
os.makedirs(folder_name, exist_ok=True)
|
215 |
folder_path = os.path.join(folder_name, "images")
|
216 |
os.makedirs(folder_path, exist_ok=True)
|
217 |
|
218 |
-
#
|
219 |
-
|
220 |
-
|
|
|
|
|
|
|
|
|
221 |
video_path = os.path.join(folder_name, "Final_Ad_Video.mp4")
|
|
|
222 |
|
223 |
print(f"I: Video generated at {video_path}")
|
224 |
metrics_text += f"Video generated at {video_path}\n"
|
|
|
15 |
from vinorm import TTSnorm
|
16 |
from langchain.llms import HuggingFacePipeline
|
17 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
18 |
+
from diffusers import StableDiffusionPipeline
|
19 |
+
from PIL import Image
|
20 |
import cv2
|
21 |
from moviepy.editor import AudioFileClip, ImageSequenceClip
|
22 |
import gc
|
|
|
73 |
sum_llm_chain = tag_chain.chain(llm=local_llm)
|
74 |
pexels_api_key = os.getenv('pexels_api_key')
|
75 |
|
76 |
+
# Initialize the Stable Diffusion pipeline with the TDN-M/East-asian-beauty checkpoint.
image_gen_model_id = "TDN-M/East-asian-beauty"
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only requested on GPU. The CPU fallback loads the weights
# in float32 because float16 inference on CPU is unsupported or extremely
# slow for many operators.
image_gen_dtype = torch.float16 if device == "cuda" else torch.float32
image_generator = StableDiffusionPipeline.from_pretrained(
    image_gen_model_id, torch_dtype=image_gen_dtype
)
image_generator = image_generator.to(device)
|
81 |
+
|
82 |
def normalize_vietnamese_text(text):
|
83 |
text = (
|
84 |
TTSnorm(text, unknown=False, lower=False, rule=True)
|
|
|
107 |
return 13000 * word_count + 2000 * num_punct
|
108 |
return -1
|
109 |
|
110 |
+
def create_video_from_audio_and_images(audio_path, images, output_path):
|
111 |
audio_clip = AudioFileClip(audio_path)
|
112 |
duration = audio_clip.duration
|
113 |
|
|
|
134 |
prompt = tokenizer.convert_tokens_to_string(tokens)
|
135 |
return prompt
|
136 |
|
137 |
+
def generate_images_from_sentences(sentences, image_generator, folder_path):
    """Generate one illustration per sentence and save each as a PNG file.

    Args:
        sentences: Iterable of text prompts, one image is generated per item.
        image_generator: Callable invoked as
            ``image_generator(sentence, guidance_scale=7.5)``; its result must
            expose an ``images`` sequence whose first element has a
            ``save(path)`` method.
        folder_path: Existing directory where ``image_<n>.png`` files are
            written (``n`` starts at 1).

    Returns:
        The list of saved image file paths, in sentence order. An empty list
        is returned if any generation or save step fails (best-effort: the
        error is printed, not raised).
    """
    image_paths = []
    try:
        for i, sentence in enumerate(sentences):
            print(f"Generating image for sentence {i + 1}: {sentence}")
            image = image_generator(sentence, guidance_scale=7.5).images[0]
            image_path = os.path.join(folder_path, f"image_{i + 1}.png")
            image.save(image_path)
            print(f"Saved image at {image_path}")
            # Collect the path so the caller can actually use the images.
            # NOTE(review): assumes the video builder accepts file paths —
            # confirm against create_video_from_audio_and_images.
            image_paths.append(image_path)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and signal
        # "no images" to the caller instead of crashing the whole request.
        print("Error! Failed generating images")
        print(e)
        return []
    return image_paths
|
149 |
+
|
150 |
@spaces.GPU
|
151 |
def predict(
|
152 |
prompt,
|
|
|
226 |
out["wav"] = out["wav"][:keep_len]
|
227 |
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
228 |
|
229 |
+
# Tạo video từ file audio và các cảnh
|
230 |
+
print("I: Generating images from sentences...")
|
231 |
# Sử dụng UUID để tạo tên thư mục ngắn gọn
|
232 |
folder_name = f"video_{uuid.uuid4().hex}"
|
233 |
os.makedirs(folder_name, exist_ok=True)
|
234 |
folder_path = os.path.join(folder_name, "images")
|
235 |
os.makedirs(folder_path, exist_ok=True)
|
236 |
|
237 |
+
# Tách các câu từ văn bản
|
238 |
+
sentences = [x.strip() for x in re.split(r'[.!?]', prompt) if len(x.strip()) > 6]
|
239 |
+
|
240 |
+
# Tạo ảnh minh họa cho từng câu
|
241 |
+
images = generate_images_from_sentences(sentences, image_generator, folder_path)
|
242 |
+
|
243 |
+
# Tạo video từ file audio và các ảnh
|
244 |
video_path = os.path.join(folder_name, "Final_Ad_Video.mp4")
|
245 |
+
create_video_from_audio_and_images("output.wav", images, video_path)
|
246 |
|
247 |
print(f"I: Video generated at {video_path}")
|
248 |
metrics_text += f"Video generated at {video_path}\n"
|