ruslanmv committed on
Commit c0408e6 · 1 Parent(s): 7702ee7

Update app.py

Files changed (1)
  1. app.py +19 -267
app.py CHANGED
@@ -11,18 +11,6 @@ import shutil
 import numpy as np
 import random
 import spaces
-# Ensure `spaces` is imported first
-#try:
-#    import spaces
-#except ImportError:
-#    class spaces:
-#        @staticmethod
-#        def GPU(func=None, duration=None):
-#            def wrapper(fn):
-#                return fn
-#            return wrapper if func is None else wrapper(func)
-
-# Now import CUDA-related libraries
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from diffusers import DiffusionPipeline
@@ -34,10 +22,28 @@ from gtts import gTTS
 from pydub import AudioSegment
 import textwrap
 
+# Log GPU Memory (optional, for debugging)
+def log_gpu_memory():
+    """Log GPU memory usage."""
+    if torch.cuda.is_available():
+        print(subprocess.check_output('nvidia-smi').decode('utf-8'))
+    else:
+        print("CUDA is not available. Cannot log GPU memory.")
+
+# Check GPU Availability
+def check_gpu_availability():
+    """Print GPU availability and device details."""
+    if torch.cuda.is_available():
+        print(f"CUDA devices: {torch.cuda.device_count()}")
+        print(f"Current device: {torch.cuda.current_device()}")
+        print(torch.cuda.get_device_properties(torch.cuda.current_device()))
+    else:
+        print("CUDA is not available. Running on CPU.")
+
+check_gpu_availability()
 # Initialize FLUX pipeline only if CUDA is available
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
-
 if device == "cuda":
     flux_pipe = DiffusionPipeline.from_pretrained(
         "black-forest-labs/FLUX.1-schnell",
@@ -53,7 +59,6 @@ nltk.download('punkt')
 
 # Ensure proper multiprocessing start method
 multiprocessing.set_start_method("spawn", force=True)
-
 # Download necessary NLTK data
 def setup_nltk():
     """Ensure required NLTK data is available."""
@@ -68,92 +73,15 @@ DESCRIPTION = (
     "PS: Generation of video by using Artificial Intelligence via FLUX, distilbart, and GTTS."
 )
 TITLE = "Video Story Generator with Audio by using FLUX, distilbart, and GTTS."
-
 # Load Tokenizer and Model for Text Summarization
-def load_text_summarization_model_V1():
-    """Load the tokenizer and model for text summarization."""
-    print("Loading text summarization model...")
-    tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
-    model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    print(f"Using device: {device}")
-    model.to(device)
-    return tokenizer, model, device
-
 def load_text_summarization_model():
     """Load the tokenizer and model for text summarization on CPU."""
     print("Loading text summarization model...")
    tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
    model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
-    # Remove the line that sets the device here
-    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    # print(f"Using device: {device}")
-    # model.to(device)
     return tokenizer, model
-
 tokenizer, model = load_text_summarization_model()
 
-tokenizer, model, device = load_text_summarization_model()
-
-# Log GPU Memory (optional, for debugging)
-def log_gpu_memory():
-    """Log GPU memory usage."""
-    if torch.cuda.is_available():
-        print(subprocess.check_output('nvidia-smi').decode('utf-8'))
-    else:
-        print("CUDA is not available. Cannot log GPU memory.")
-
-# Check GPU Availability
-def check_gpu_availability():
-    """Print GPU availability and device details."""
-    if torch.cuda.is_available():
-        print(f"CUDA devices: {torch.cuda.device_count()}")
-        print(f"Current device: {torch.cuda.current_device()}")
-        print(torch.cuda.get_device_properties(torch.cuda.current_device()))
-    else:
-        print("CUDA is not available. Running on CPU.")
-
-#check_gpu_availability()
-
-#@spaces.GPU()
-def generate_image_with_flux_old(
-    text: str,
-    seed: int = 42,
-    width: int = 1024,
-    height: int = 1024,
-    num_inference_steps: int = 4,
-    randomize_seed: bool = True
-):
-    """
-    Generates an image from text using FLUX.
-
-    Args:
-        text: The text prompt to generate the image from.
-        seed: The random seed for image generation. -1 for random.
-        width: Width of the generated image.
-        height: Height of the generated image.
-        num_inference_steps: Number of inference steps.
-        randomize_seed: Whether to randomize the seed.
-
-    Returns:
-        A PIL Image object.
-    """
-    print(f"DEBUG: Generating image with FLUX for text: '{text}'")
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = flux_pipe(
-        prompt=text,
-        width=width,
-        height=height,
-        num_inference_steps=num_inference_steps,
-        generator=generator,
-        guidance_scale=0.0
-    ).images[0]
-
-    print("DEBUG: Image generated successfully.")
-    return image
-
 
 @spaces.GPU()
 def generate_image_with_flux(
@@ -218,183 +146,7 @@ def merge_audio_files(mp3_names: List[str]) -> str:
     print(f"DEBUG: Audio files merged and saved to {export_path}")
     return export_path
 
-
-
-
-
-
-
-
-
-
-
-
-
 # Function to generate video from text
-def get_output_video_old(text, seed, randomize_seed, width, height, num_inference_steps):
-    print("DEBUG: Starting get_output_video function...")
-
-    # Summarize the input text
-    print("DEBUG: Summarizing text...")
-    inputs = tokenizer(
-        text,
-        max_length=1024,
-        truncation=True,
-        return_tensors="pt"
-    ).to(device)
-    summary_ids = model.generate(inputs["input_ids"])
-    summary = tokenizer.batch_decode(
-        summary_ids,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False
-    )
-    plot = list(summary[0].split('.'))
-    print(f"DEBUG: Summary generated: {plot}")
-
-    image_system = "Generate a realistic picture about this: "
-
-    # Generate images for each sentence in the plot
-    generated_images = []
-    for i, senten in enumerate(plot[:-1]):
-        print(f"DEBUG: Generating image {i+1} of {len(plot)-1}...")
-        image_dir = f"image_{i}"
-        os.makedirs(image_dir, exist_ok=True)
-        image = generate_image_with_flux(
-            text=image_system + senten,
-            seed=seed,
-            randomize_seed=randomize_seed,
-            width=width,
-            height=height,
-            num_inference_steps=num_inference_steps
-        )
-        generated_images.append(image)
-        image_path = os.path.join(image_dir, "generated_image.png")
-        image.save(image_path)
-        print(f"DEBUG: Image generated and saved to {image_path}")
-
-    #del min_dalle_model  # No need to delete the model here
-    # torch.cuda.empty_cache()  # No need to empty cache here
-    # gc.collect()  # No need to collect garbage here
-
-    # Create subtitles from the plot
-    sentences = plot[:-1]
-    print("DEBUG: Creating subtitles...")
-    assert len(generated_images) == len(sentences), "Mismatch in number of images and sentences."
-    sub_names = [nltk.tokenize.sent_tokenize(sentence) for sentence in sentences]
-
-    # Add subtitles to images with dynamic adjustments
-    def get_dynamic_wrap_width(font, text, image_width, padding):
-        # Estimate the number of characters per line dynamically
-        avg_char_width = sum(font.getbbox(c)[2] for c in text) / len(text)
-        return max(1, (image_width - padding * 2) // avg_char_width)
-
-    def draw_multiple_line_text(image, text, font, text_color, text_start_height, padding=10):
-        draw = ImageDraw.Draw(image)
-        image_width, _ = image.size
-        y_text = text_start_height
-        lines = textwrap.wrap(text, width=get_dynamic_wrap_width(font, text, image_width, padding))
-        for line in lines:
-            line_width, line_height = font.getbbox(line)[2:]
-            draw.text(((image_width - line_width) / 2, y_text), line, font=font, fill=text_color)
-            y_text += line_height + padding
-
-    def add_text_to_img(text1, image_input):
-        print(f"DEBUG: Adding text to image: '{text1}'")
-        # Scale font size dynamically
-        base_font_size = 30
-        image_width, image_height = image_input.size
-        scaled_font_size = max(10, int(base_font_size * (image_width / 800)))
-        path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
-        if not os.path.exists(path_font):
-            path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
-        font = ImageFont.truetype(path_font, scaled_font_size)
-
-        text_color = (255, 255, 0)
-        padding = 10
-
-        # Estimate starting height dynamically
-        line_height = font.getbbox("A")[3] + padding
-        total_text_height = len(textwrap.wrap(text1, get_dynamic_wrap_width(font, text1, image_width, padding))) * line_height
-        text_start_height = image_height - total_text_height - 20
-
-        draw_multiple_line_text(image_input, text1, font, text_color, text_start_height, padding)
-        return image_input
-
-
-    # Process images with subtitles
-    generated_images_sub = []
-    for k, image in enumerate(generated_images):
-        text_to_add = sub_names[k][0]
-        result = add_text_to_img(text_to_add, image.copy())
-        generated_images_sub.append(result)
-        result.save(f"image_{k}/generated_image_with_subtitles.png")
-
-
-
-    # Generate audio for each subtitle
-    mp3_names = []
-    mp3_lengths = []
-    for k, text_to_add in enumerate(sub_names):
-        print(f"DEBUG: Generating audio for: '{text_to_add[0]}'")
-        f_name = f'audio_{k}.mp3'
-        mp3_names.append(f_name)
-        myobj = gTTS(text=text_to_add[0], lang='en', slow=False)
-        myobj.save(f_name)
-        audio = MP3(f_name)
-        mp3_lengths.append(audio.info.length)
-        print(f"DEBUG: Audio duration: {audio.info.length} seconds")
-
-    # Merge audio files
-    export_path = merge_audio_files(mp3_names)
-
-    # Create video clips from images
-    clips = []
-    for k, img in enumerate(generated_images_sub):
-        duration = mp3_lengths[k]
-        print(f"DEBUG: Creating video clip {k+1} with duration: {duration} seconds")
-        clip = mpy.ImageClip(f"image_{k}/generated_image_with_subtitles.png").set_duration(duration + 0.5)
-        clips.append(clip)
-
-    # Concatenate video clips
-    print("DEBUG: Concatenating video clips...")
-    concat_clip = mpy.concatenate_videoclips(clips, method="compose")
-    concat_clip.write_videofile("result_no_audio.mp4", fps=24, logger=None)
-
-    # Combine video and audio
-    movie_name = 'result_no_audio.mp4'
-    movie_final = 'result_final.mp4'
-
-    def combine_audio(vidname, audname, outname, fps=24):
-        print(f"DEBUG: Combining audio for video: '{vidname}'")
-        my_clip = mpy.VideoFileClip(vidname)
-        audio_background = mpy.AudioFileClip(audname)
-        final_clip = my_clip.set_audio(audio_background)
-        final_clip.write_videofile(outname, fps=fps, logger=None)
-
-    combine_audio(movie_name, export_path, movie_final)
-
-    # Clean up
-    print("DEBUG: Cleaning up files...")
-    for i in range(len(generated_images_sub)):
-        shutil.rmtree(f"image_{i}")
-        os.remove(f"audio_{i}.mp3")
-    os.remove("result.mp3")
-    os.remove("result_no_audio.mp4")
-
-    print("DEBUG: Cleanup complete.")
-    print("DEBUG: get_output_video function completed successfully.")
-    return 'result_final.mp4'
-
-
-
-
-
-# Function to generate video from text
-
-
-
-
-
 @spaces.GPU()
 def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
     print("DEBUG: Starting get_output_video function...")