ruslanmv committed
Commit 6499e9c · 1 Parent(s): 1b6fe15

Update app.py

Files changed (1)
  1. app.py +246 -4
app.py CHANGED
@@ -70,7 +70,7 @@ DESCRIPTION = (
 TITLE = "Video Story Generator with Audio by using FLUX, distilbart, and GTTS."
 
 # Load Tokenizer and Model for Text Summarization
-def load_text_summarization_model():
+def load_text_summarization_model_V1():
     """Load the tokenizer and model for text summarization."""
     print("Loading text summarization model...")
     tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
@@ -80,6 +80,19 @@ def load_text_summarization_model():
     model.to(device)
     return tokenizer, model, device
 
+def load_text_summarization_model():
+    """Load the tokenizer and model for text summarization on CPU."""
+    print("Loading text summarization model...")
+    tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
+    model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
+    # Remove the line that sets the device here
+    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    # print(f"Using device: {device}")
+    # model.to(device)
+    return tokenizer, model
+
+tokenizer, model = load_text_summarization_model()
+
 tokenizer, model, device = load_text_summarization_model()
 
 # Log GPU Memory (optional, for debugging)
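The hunk above follows the ZeroGPU pattern used on Hugging Face Spaces: the summarization model is loaded on CPU at import time, and device placement is deferred to functions decorated with @spaces.GPU(), which hold a GPU only while they run. Note that the diff also keeps the original three-value call tokenizer, model, device = load_text_summarization_model(), which no longer unpacks the new two-value return. A minimal sketch of the pattern, with an illustrative summarize() helper that is not part of the commit:

import torch
import spaces
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Loaded once at import time, on CPU: no GPU is attached yet on a ZeroGPU Space.
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")

@spaces.GPU()  # a GPU is attached only for the duration of this call
def summarize(text: str) -> str:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)  # move the CPU-resident model onto the borrowed GPU
    inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt").to(device)
    summary_ids = model.generate(inputs["input_ids"])
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0]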
@@ -102,8 +115,8 @@ def check_gpu_availability():
 
 check_gpu_availability()
 
-@spaces.GPU()
-def generate_image_with_flux(
+#@spaces.GPU()
+def generate_image_with_flux_old(
         text: str,
         seed: int = 42,
         width: int = 1024,
@@ -141,6 +154,48 @@ def generate_image_with_flux(
     print("DEBUG: Image generated successfully.")
     return image
 
+
+@spaces.GPU()
+def generate_image_with_flux(
+        text: str,
+        seed: int = 42,
+        width: int = 1024,
+        height: int = 1024,
+        num_inference_steps: int = 4,
+        randomize_seed: bool = True):
+    """
+    Generates an image from text using FLUX.
+    Args:
+        text: The text prompt to generate the image from.
+        seed: The random seed for image generation. -1 for random.
+        width: Width of the generated image.
+        height: Height of the generated image.
+        num_inference_steps: Number of inference steps.
+        randomize_seed: Whether to randomize the seed.
+    Returns:
+        A PIL Image object.
+    """
+    print(f"DEBUG: Generating image with FLUX for text: '{text}'")
+
+    # Initialize FLUX pipeline here
+    dtype = torch.bfloat16
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    flux_pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=dtype).to(device)
+
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    generator = torch.Generator(device=device).manual_seed(seed)  # Specify device for generator
+    image = flux_pipe(
+        prompt=text,
+        width=width,
+        height=height,
+        num_inference_steps=num_inference_steps,
+        generator=generator,
+        guidance_scale=0.0
+    ).images[0]
+    print("DEBUG: Image generated successfully.")
+    return image
+
 # --------- End of MinDalle Functions ---------
 # Merge audio files
 
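In the rewritten generate_image_with_flux, the FLUX.1-schnell pipeline is constructed inside the @spaces.GPU() function, so the weights occupy a GPU only while a request is served; the trade-off is that the pipeline is re-created on every call. FLUX.1-schnell is a timestep-distilled model, which is why 4 inference steps with guidance_scale=0.0 are sensible defaults. A hypothetical call (prompt and values illustrative, not from the commit):

image = generate_image_with_flux(
    text="Generate a realistic picture about this: a birthday cake on a kitchen table",
    seed=42,
    width=1024,
    height=1024,
    num_inference_steps=4,
    randomize_seed=False,  # keep the seed fixed for reproducible output
)
image.save("preview.png")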
@@ -165,8 +220,18 @@ def merge_audio_files(mp3_names: List[str]) -> str:
 
 
 
+
+
+
+
+
+
+
+
+
+
 # Function to generate video from text
-def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
+def get_output_video_old(text, seed, randomize_seed, width, height, num_inference_steps):
     print("DEBUG: Starting get_output_video function...")
 
     # Summarize the input text
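The hunk header references merge_audio_files(mp3_names: List[str]) -> str, whose body lies outside this diff. For orientation, here is one common way such a helper is written with pydub; this is an assumption about the implementation, not code from the commit (the output name result.mp3 is inferred from the cleanup step later in the diff):

from typing import List
from pydub import AudioSegment  # assumed dependency, not confirmed by this diff

def merge_audio_files(mp3_names: List[str]) -> str:
    """Concatenate MP3 clips in order and write them out as one track."""
    combined = AudioSegment.empty()
    for name in mp3_names:
        combined += AudioSegment.from_mp3(name)
    export_path = "result.mp3"  # matches the file removed during cleanup
    combined.export(export_path, format="mp3")
    return export_path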
@@ -320,6 +385,183 @@ def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
     print("DEBUG: get_output_video function completed successfully.")
     return 'result_final.mp4'
 
+
+
+
+
+# Function to generate video from text
+
+
+
+
+
+@spaces.GPU()
+def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
+    print("DEBUG: Starting get_output_video function...")
+
+    # Set the device here, inside the GPU-accelerated function
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Move the model to the GPU
+    model.to(device)
+
+    # Summarize the input text
+    print("DEBUG: Summarizing text...")
+    inputs = tokenizer(
+        text,
+        max_length=1024,
+        truncation=True,
+        return_tensors="pt"
+    ).to(device)  # Now it's safe to move to the device
+    summary_ids = model.generate(inputs["input_ids"].to(device))  # .to(device) here
+    summary = tokenizer.batch_decode(
+        summary_ids,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    )
+    plot = list(summary[0].split('.'))
+    print(f"DEBUG: Summary generated: {plot}")
+
+    image_system = "Generate a realistic picture about this: "
+
+    # Generate images for each sentence in the plot
+    generated_images = []
+    for i, senten in enumerate(plot[:-1]):
+        print(f"DEBUG: Generating image {i+1} of {len(plot)-1}...")
+        image_dir = f"image_{i}"
+        os.makedirs(image_dir, exist_ok=True)
+        image = generate_image_with_flux(
+            text=image_system + senten,
+            seed=seed,
+            randomize_seed=randomize_seed,
+            width=width,
+            height=height,
+            num_inference_steps=num_inference_steps
+        )
+        generated_images.append(image)
+        image_path = os.path.join(image_dir, "generated_image.png")
+        image.save(image_path)
+        print(f"DEBUG: Image generated and saved to {image_path}")
+
+    # del min_dalle_model  # No need to delete the model here
+    # torch.cuda.empty_cache()  # No need to empty cache here
+    # gc.collect()  # No need to collect garbage here
+
+    # Create subtitles from the plot
+    sentences = plot[:-1]
+    print("DEBUG: Creating subtitles...")
+    assert len(generated_images) == len(sentences), "Mismatch in number of images and sentences."
+    sub_names = [nltk.tokenize.sent_tokenize(sentence) for sentence in sentences]
+
+    # Add subtitles to images with dynamic adjustments
+    def get_dynamic_wrap_width(font, text, image_width, padding):
+        # Estimate the number of characters per line dynamically
+        avg_char_width = sum(font.getbbox(c)[2] for c in text) / len(text)
+        return max(1, (image_width - padding * 2) // avg_char_width)
+
+    def draw_multiple_line_text(image, text, font, text_color, text_start_height, padding=10):
+        draw = ImageDraw.Draw(image)
+        image_width, _ = image.size
+        y_text = text_start_height
+        lines = textwrap.wrap(text, width=get_dynamic_wrap_width(font, text, image_width, padding))
+        for line in lines:
+            line_width, line_height = font.getbbox(line)[2:]
+            draw.text(((image_width - line_width) / 2, y_text), line, font=font, fill=text_color)
+            y_text += line_height + padding
+
+    def add_text_to_img(text1, image_input):
+        print(f"DEBUG: Adding text to image: '{text1}'")
+        # Scale font size dynamically
+        base_font_size = 30
+        image_width, image_height = image_input.size
+        scaled_font_size = max(10, int(base_font_size * (image_width / 800)))
+        path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
+        if not os.path.exists(path_font):
+            path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
+        font = ImageFont.truetype(path_font, scaled_font_size)
+
+        text_color = (255, 255, 0)
+        padding = 10
+
+        # Estimate starting height dynamically
+        line_height = font.getbbox("A")[3] + padding
+        total_text_height = len(textwrap.wrap(text1, get_dynamic_wrap_width(font, text1, image_width, padding))) * line_height
+        text_start_height = image_height - total_text_height - 20
+
+        draw_multiple_line_text(image_input, text1, font, text_color, text_start_height, padding)
+        return image_input
+
+
+    # Process images with subtitles
+    generated_images_sub = []
+    for k, image in enumerate(generated_images):
+        text_to_add = sub_names[k][0]
+        result = add_text_to_img(text_to_add, image.copy())
+        generated_images_sub.append(result)
+        result.save(f"image_{k}/generated_image_with_subtitles.png")
+
+
+
+    # Generate audio for each subtitle
+    mp3_names = []
+    mp3_lengths = []
+    for k, text_to_add in enumerate(sub_names):
+        print(f"DEBUG: Generating audio for: '{text_to_add[0]}'")
+        f_name = f'audio_{k}.mp3'
+        mp3_names.append(f_name)
+        myobj = gTTS(text=text_to_add[0], lang='en', slow=False)
+        myobj.save(f_name)
+        audio = MP3(f_name)
+        mp3_lengths.append(audio.info.length)
+        print(f"DEBUG: Audio duration: {audio.info.length} seconds")
+
+    # Merge audio files
+    export_path = merge_audio_files(mp3_names)
+
+    # Create video clips from images
+    clips = []
+    for k, img in enumerate(generated_images_sub):
+        duration = mp3_lengths[k]
+        print(f"DEBUG: Creating video clip {k+1} with duration: {duration} seconds")
+        clip = mpy.ImageClip(f"image_{k}/generated_image_with_subtitles.png").set_duration(duration + 0.5)
+        clips.append(clip)
+
+    # Concatenate video clips
+    print("DEBUG: Concatenating video clips...")
+    concat_clip = mpy.concatenate_videoclips(clips, method="compose")
+    concat_clip.write_videofile("result_no_audio.mp4", fps=24, logger=None)
+
+    # Combine video and audio
+    movie_name = 'result_no_audio.mp4'
+    movie_final = 'result_final.mp4'
+
+    def combine_audio(vidname, audname, outname, fps=24):
+        print(f"DEBUG: Combining audio for video: '{vidname}'")
+        my_clip = mpy.VideoFileClip(vidname)
+        audio_background = mpy.AudioFileClip(audname)
+        final_clip = my_clip.set_audio(audio_background)
+        final_clip.write_videofile(outname, fps=fps, logger=None)
+
+    combine_audio(movie_name, export_path, movie_final)
+
+    # Clean up
+    print("DEBUG: Cleaning up files...")
+    for i in range(len(generated_images_sub)):
+        shutil.rmtree(f"image_{i}")
+        os.remove(f"audio_{i}.mp3")
+    os.remove("result.mp3")
+    os.remove("result_no_audio.mp4")
+
+    print("DEBUG: Cleanup complete.")
+    print("DEBUG: get_output_video function completed successfully.")
+    return 'result_final.mp4'
+
+
+
+
+
+
+
 # Example text (can be changed by user in Gradio interface)
 text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'
 
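A detail worth noting in the subtitle helpers of the new get_output_video: line wrapping is derived from real font metrics rather than a fixed character count. font.getbbox(c)[2] returns the right edge of each glyph's bounding box, so the average character width tracks whatever font and size were loaded. Isolated as a standalone sketch (illustrative names; the width is coerced to int here, which textwrap expects):

import textwrap
from PIL import ImageFont

def wrap_for_image(text: str, font: ImageFont.FreeTypeFont,
                   image_width: int, padding: int = 10) -> list:
    """Wrap text so each rendered line fits inside the image width."""
    avg_char_width = sum(font.getbbox(c)[2] for c in text) / len(text)
    chars_per_line = max(1, int((image_width - padding * 2) // avg_char_width))
    return textwrap.wrap(text, width=chars_per_line)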
 
 
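The synchronization trick at the heart of get_output_video is to let the narration drive the video: each sentence is voiced with gTTS, its duration is read back with mutagen, and the matching subtitled image is shown for that length plus half a second of padding before the clips are concatenated. Reduced to its essentials (a self-contained sketch, not the commit's code):

import moviepy.editor as mpy
from gtts import gTTS
from mutagen.mp3 import MP3

def timed_clip(sentence: str, image_path: str, index: int) -> mpy.ImageClip:
    """Narrate one sentence and return an image clip lasting as long as the audio."""
    audio_name = f"audio_{index}.mp3"
    gTTS(text=sentence, lang='en', slow=False).save(audio_name)
    duration = MP3(audio_name).info.length  # narration length in seconds
    return mpy.ImageClip(image_path).set_duration(duration + 0.5)

# clips = [timed_clip(s, f"image_{k}/generated_image_with_subtitles.png", k)
#          for k, s in enumerate(sentences)]
# mpy.concatenate_videoclips(clips, method="compose").write_videofile("result_no_audio.mp4", fps=24)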