Update app.py
app.py CHANGED
@@ -18,6 +18,8 @@ import shutil
 import matplotlib.pyplot as plt
 import gc  # Import the garbage collector
 from audio import *
+import os
+
 # Download necessary NLTK data
 try:
     nltk.data.find('tokenizers/punkt')
@@ -70,8 +72,6 @@ def load_min_dalle_model(models_root: str = 'pretrained', fp16: bool = True):
 # Initialize the MinDalle model
 min_dalle_model = load_min_dalle_model()
 
-
-
 def generate_image_with_min_dalle(
     model: MinDalle,
     text: str,
@@ -114,10 +114,6 @@ def generate_image_with_min_dalle(
 from pydub import AudioSegment
 import os
 
-
-
-
-
 # Function to generate video from text
 def get_output_video(text):
     print("DEBUG: Starting get_output_video function...")
@@ -169,31 +165,46 @@ def get_output_video(text):
     assert len(generated_images) == len(sentences), "Mismatch in number of images and sentences."
     sub_names = [nltk.tokenize.sent_tokenize(sentence) for sentence in sentences]
 
-    # Add subtitles to images
-    def draw_multiple_line_text(image, text, font, text_color, text_start_height):
+    # Add subtitles to images with dynamic adjustments
+    def get_dynamic_wrap_width(font, text, image_width, padding):
+        # Estimate the number of characters per line dynamically
+        avg_char_width = sum(font.getbbox(c)[2] for c in text) / len(text)
+        return max(1, (image_width - padding * 2) // avg_char_width)
+
+    def draw_multiple_line_text(image, text, font, text_color, text_start_height, padding=10):
         draw = ImageDraw.Draw(image)
-        image_width, image_height = image.size
+        image_width, _ = image.size
         y_text = text_start_height
-        lines = textwrap.wrap(text, width=…)
+        lines = textwrap.wrap(text, width=get_dynamic_wrap_width(font, text, image_width, padding))
         for line in lines:
             line_width, line_height = font.getbbox(line)[2:]
-            draw.text(((image_width - line_width) / 2, y_text),
-                      line, font=font, fill=text_color)
-            y_text += line_height
+            draw.text(((image_width - line_width) / 2, y_text), line, font=font, fill=text_color)
+            y_text += line_height + padding
 
     def add_text_to_img(text1, image_input):
         print(f"DEBUG: Adding text to image: '{text1}'")
-        fontsize = …
+        # Scale font size dynamically
+        base_font_size = 30
+        image_width, image_height = image_input.size
+        scaled_font_size = max(10, int(base_font_size * (image_width / 800)))  # Adjust 800 based on typical image width
         path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
         if not os.path.exists(path_font):
             path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
-
-        font = ImageFont.truetype(path_font, fontsize)
+        font = ImageFont.truetype(path_font, scaled_font_size)
+
         text_color = (255, 255, 0)
-        text_start_height = …
-        draw_multiple_line_text(image_input, text1, font, text_color, text_start_height)
+        padding = 10
+
+        # Estimate starting height dynamically
+        line_height = font.getbbox("A")[3] + padding
+        total_text_height = len(textwrap.wrap(text1, get_dynamic_wrap_width(font, text1, image_width, padding))) * line_height
+        text_start_height = image_height - total_text_height - 20
+
+        draw_multiple_line_text(image_input, text1, font, text_color, text_start_height, padding)
         return image_input
 
+
+    # Process images with subtitles
     generated_images_sub = []
     for k, image in enumerate(generated_images):
         text_to_add = sub_names[k][0]
@@ -201,6 +212,8 @@ def get_output_video(text):
         generated_images_sub.append(result)
         result.save(f"image_{k}/generated_image_with_subtitles.png")
 
+
+
     # Generate audio for each subtitle
     mp3_names = []
     mp3_lengths = []
@@ -277,4 +290,4 @@ with demo:
     button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)
 
 # Launch the Gradio app
-demo.launch(debug=True, share=…)
+demo.launch(debug=True, share=True)
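Taken together, the subtitle hunk replaces fixed layout constants with three dynamic adjustments: the wrap width is estimated from the average glyph width of the caption, the font size is scaled with the image width, and the starting height is derived from the wrapped line count so the text block ends just above the bottom edge. Below is a minimal standalone sketch of that logic, assuming only Pillow is installed; the DejaVu font path, canvas size, and caption are placeholder assumptions, and an int() cast is added on the wrap width so textwrap.wrap always receives an integer.

# Minimal sketch of the commit's dynamic subtitle logic; font path, canvas, and caption are assumptions.
import textwrap
from PIL import Image, ImageDraw, ImageFont

def get_dynamic_wrap_width(font, text, image_width, padding):
    # Estimate characters per line from the average glyph width of this caption.
    avg_char_width = sum(font.getbbox(c)[2] for c in text) / len(text)
    return max(1, int((image_width - padding * 2) // avg_char_width))  # int(): textwrap expects an integer width

def draw_multiple_line_text(image, text, font, text_color, text_start_height, padding=10):
    draw = ImageDraw.Draw(image)
    image_width, _ = image.size
    y_text = text_start_height
    for line in textwrap.wrap(text, width=get_dynamic_wrap_width(font, text, image_width, padding)):
        line_width, line_height = font.getbbox(line)[2:]
        # Center each wrapped line horizontally, stepping down by line height plus padding.
        draw.text(((image_width - line_width) / 2, y_text), line, font=font, fill=text_color)
        y_text += line_height + padding

if __name__ == "__main__":
    img = Image.new("RGB", (640, 360), "black")  # stand-in for a generated frame
    try:
        # Same scaling rule as the commit: 30 px at an 800 px wide image, floor of 10 px.
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
                                  max(10, int(30 * (img.width / 800))))
    except OSError:
        font = ImageFont.load_default()  # bitmap fallback; getbbox needs Pillow >= 9.2
    caption = "A quick brown fox jumps over the lazy dog."
    padding = 10
    line_height = font.getbbox("A")[3] + padding
    n_lines = len(textwrap.wrap(caption, get_dynamic_wrap_width(font, caption, img.width, padding)))
    draw_multiple_line_text(img, caption, font, (255, 255, 0),
                            img.height - n_lines * line_height - 20, padding)
    img.save("subtitle_demo.png")

Averaging glyph widths over the actual caption, rather than wrapping at a fixed character count, keeps long captions from overflowing narrow frames, at the cost of one extra pass over the text.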