Spaces:

ruslanmv
/

TextToVideo-Flux

Paused

App Files Files Community

ruslanmv committed on Feb 2

Commit

c0408e6

1 Parent(s): 7702ee7

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -267

app.py CHANGED Viewed

@@ -11,18 +11,6 @@ import shutil
 import numpy as np
 import random
 import spaces
-# Ensure `spaces` is imported first
-#try:
-#    import spaces
-#except ImportError:
-#    class spaces:
-#        @staticmethod
-#        def GPU(func=None, duration=None):
-#            def wrapper(fn):
-#                return fn
-#            return wrapper if func is None else wrapper(func)
-# Now import CUDA-related libraries
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from diffusers import DiffusionPipeline
@@ -34,10 +22,28 @@ from gtts import gTTS
 from pydub import AudioSegment
 import textwrap
 # Initialize FLUX pipeline only if CUDA is available
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 if device == "cuda":
     flux_pipe = DiffusionPipeline.from_pretrained(
         "black-forest-labs/FLUX.1-schnell",
@@ -53,7 +59,6 @@ nltk.download('punkt')
 # Ensure proper multiprocessing start method
 multiprocessing.set_start_method("spawn", force=True)
 # Download necessary NLTK data
 def setup_nltk():
     """Ensure required NLTK data is available."""
@@ -68,92 +73,15 @@ DESCRIPTION = (
     "PS: Generation of video by using Artificial Intelligence via FLUX, distilbart, and GTTS."
 )
 TITLE = "Video Story Generator with Audio by using FLUX, distilbart, and GTTS."
 # Load Tokenizer and Model for Text Summarization
-def load_text_summarization_model_V1():
-    """Load the tokenizer and model for text summarization."""
-    print("Loading text summarization model...")
-    tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
-    model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    print(f"Using device: {device}")
-    model.to(device)
-    return tokenizer, model, device
 def load_text_summarization_model():
     """Load the tokenizer and model for text summarization on CPU."""
     print("Loading text summarization model...")
     tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
     model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
-    # Remove the line that sets the device here
-    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    # print(f"Using device: {device}")
-    # model.to(device)
     return tokenizer, model
 tokenizer, model = load_text_summarization_model()
-tokenizer, model, device = load_text_summarization_model()
-# Log GPU Memory (optional, for debugging)
-def log_gpu_memory():
-    """Log GPU memory usage."""
-    if torch.cuda.is_available():
-        print(subprocess.check_output('nvidia-smi').decode('utf-8'))
-    else:
-        print("CUDA is not available. Cannot log GPU memory.")
-# Check GPU Availability
-def check_gpu_availability():
-    """Print GPU availability and device details."""
-    if torch.cuda.is_available():
-        print(f"CUDA devices: {torch.cuda.device_count()}")
-        print(f"Current device: {torch.cuda.current_device()}")
-        print(torch.cuda.get_device_properties(torch.cuda.current_device()))
-    else:
-        print("CUDA is not available. Running on CPU.")
-#check_gpu_availability()
-#@spaces.GPU()
-def generate_image_with_flux_old(
-    text: str,
-    seed: int = 42,
-    width: int = 1024,
-    height: int = 1024,
-    num_inference_steps: int = 4,
-    randomize_seed: bool = True
-):
-    """
-    Generates an image from text using FLUX.
-    Args:
-        text: The text prompt to generate the image from.
-        seed: The random seed for image generation. -1 for random.
-        width: Width of the generated image.
-        height: Height of the generated image.
-        num_inference_steps: Number of inference steps.
-        randomize_seed: Whether to randomize the seed.
-    Returns:
-        A PIL Image object.
-    """
-    print(f"DEBUG: Generating image with FLUX for text: '{text}'")
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = flux_pipe(
-        prompt=text,
-        width=width,
-        height=height,
-        num_inference_steps=num_inference_steps,
-        generator=generator,
-        guidance_scale=0.0
-    ).images[0]
-    print("DEBUG: Image generated successfully.")
-    return image
 @spaces.GPU()
 def generate_image_with_flux(
@@ -218,183 +146,7 @@ def merge_audio_files(mp3_names: List[str]) -> str:
     print(f"DEBUG: Audio files merged and saved to {export_path}")
     return export_path
 # Function to generate video from text
-def get_output_video_old(text, seed, randomize_seed, width, height, num_inference_steps):
-    print("DEBUG: Starting get_output_video function...")
-    # Summarize the input text
-    print("DEBUG: Summarizing text...")
-    inputs = tokenizer(
-        text,
-        max_length=1024,
-        truncation=True,
-        return_tensors="pt"
-    ).to(device)
-    summary_ids = model.generate(inputs["input_ids"])
-    summary = tokenizer.batch_decode(
-        summary_ids,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False
-    )
-    plot = list(summary[0].split('.'))
-    print(f"DEBUG: Summary generated: {plot}")
-    image_system ="Generate a realistic picture about this: "
-    # Generate images for each sentence in the plot
-    generated_images = []
-    for i, senten in enumerate(plot[:-1]):
-        print(f"DEBUG: Generating image {i+1} of {len(plot)-1}...")
-        image_dir = f"image_{i}"
-        os.makedirs(image_dir, exist_ok=True)
-        image = generate_image_with_flux(
-            text= image_system + senten,
-            seed=seed,
-            randomize_seed=randomize_seed,
-            width=width,
-            height=height,
-            num_inference_steps=num_inference_steps
-        )
-        generated_images.append(image)
-        image_path = os.path.join(image_dir, "generated_image.png")
-        image.save(image_path)
-        print(f"DEBUG: Image generated and saved to {image_path}")
-        #del min_dalle_model # No need to delete the model here
-        # torch.cuda.empty_cache() # No need to empty cache here
-        # gc.collect() # No need to collect garbage here
-    # Create subtitles from the plot
-    sentences = plot[:-1]
-    print("DEBUG: Creating subtitles...")
-    assert len(generated_images) == len(sentences), "Mismatch in number of images and sentences."
-    sub_names = [nltk.tokenize.sent_tokenize(sentence) for sentence in sentences]
-    # Add subtitles to images with dynamic adjustments
-    def get_dynamic_wrap_width(font, text, image_width, padding):
-        # Estimate the number of characters per line dynamically
-        avg_char_width = sum(font.getbbox(c)[2] for c in text) / len(text)
-        return max(1, (image_width - padding * 2) // avg_char_width)
-    def draw_multiple_line_text(image, text, font, text_color, text_start_height, padding=10):
-        draw = ImageDraw.Draw(image)
-        image_width, _ = image.size
-        y_text = text_start_height
-        lines = textwrap.wrap(text, width=get_dynamic_wrap_width(font, text, image_width, padding))
-        for line in lines:
-            line_width, line_height = font.getbbox(line)[2:]
-            draw.text(((image_width - line_width) / 2, y_text), line, font=font, fill=text_color)
-            y_text += line_height + padding
-    def add_text_to_img(text1, image_input):
-        print(f"DEBUG: Adding text to image: '{text1}'")
-        # Scale font size dynamically
-        base_font_size = 30
-        image_width, image_height = image_input.size
-        scaled_font_size = max(10, int(base_font_size * (image_width / 800)))
-        path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
-        if not os.path.exists(path_font):
-            path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
-        font = ImageFont.truetype(path_font, scaled_font_size)
-        text_color = (255, 255, 0)
-        padding = 10
-        # Estimate starting height dynamically
-        line_height = font.getbbox("A")[3] + padding
-        total_text_height = len(textwrap.wrap(text1, get_dynamic_wrap_width(font, text1, image_width, padding))) * line_height
-        text_start_height = image_height - total_text_height - 20
-        draw_multiple_line_text(image_input, text1, font, text_color, text_start_height, padding)
-        return image_input
-    # Process images with subtitles
-    generated_images_sub = []
-    for k, image in enumerate(generated_images):
-        text_to_add = sub_names[k][0]
-        result = add_text_to_img(text_to_add, image.copy())
-        generated_images_sub.append(result)
-        result.save(f"image_{k}/generated_image_with_subtitles.png")
-    # Generate audio for each subtitle
-    mp3_names = []
-    mp3_lengths = []
-    for k, text_to_add in enumerate(sub_names):
-        print(f"DEBUG: Generating audio for: '{text_to_add[0]}'")
-        f_name = f'audio_{k}.mp3'
-        mp3_names.append(f_name)
-        myobj = gTTS(text=text_to_add[0], lang='en', slow=False)
-        myobj.save(f_name)
-        audio = MP3(f_name)
-        mp3_lengths.append(audio.info.length)
-        print(f"DEBUG: Audio duration: {audio.info.length} seconds")
-    # Merge audio files
-    export_path = merge_audio_files(mp3_names)
-    # Create video clips from images
-    clips = []
-    for k, img in enumerate(generated_images_sub):
-        duration = mp3_lengths[k]
-        print(f"DEBUG: Creating video clip {k+1} with duration: {duration} seconds")
-        clip = mpy.ImageClip(f"image_{k}/generated_image_with_subtitles.png").set_duration(duration + 0.5)
-        clips.append(clip)
-    # Concatenate video clips
-    print("DEBUG: Concatenating video clips...")
-    concat_clip = mpy.concatenate_videoclips(clips, method="compose")
-    concat_clip.write_videofile("result_no_audio.mp4", fps=24, logger=None)
-    # Combine video and audio
-    movie_name = 'result_no_audio.mp4'
-    movie_final = 'result_final.mp4'
-    def combine_audio(vidname, audname, outname, fps=24):
-        print(f"DEBUG: Combining audio for video: '{vidname}'")
-        my_clip = mpy.VideoFileClip(vidname)
-        audio_background = mpy.AudioFileClip(audname)
-        final_clip = my_clip.set_audio(audio_background)
-        final_clip.write_videofile(outname, fps=fps, logger=None)
-    combine_audio(movie_name, export_path, movie_final)
-    # Clean up
-    print("DEBUG: Cleaning up files...")
-    for i in range(len(generated_images_sub)):
-        shutil.rmtree(f"image_{i}")
-        os.remove(f"audio_{i}.mp3")
-    os.remove("result.mp3")
-    os.remove("result_no_audio.mp4")
-    print("DEBUG: Cleanup complete.")
-    print("DEBUG: get_output_video function completed successfully.")
-    return 'result_final.mp4'
-# Function to generate video from text
 @spaces.GPU()
 def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
     print("DEBUG: Starting get_output_video function...")

 import numpy as np
 import random
 import spaces
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from diffusers import DiffusionPipeline
 from pydub import AudioSegment
 import textwrap
+# Log GPU Memory (optional, for debugging)
+def log_gpu_memory():
+    """Log GPU memory usage."""
+    if torch.cuda.is_available():
+        print(subprocess.check_output('nvidia-smi').decode('utf-8'))
+    else:
+        print("CUDA is not available. Cannot log GPU memory.")
+# Check GPU Availability
+def check_gpu_availability():
+    """Print GPU availability and device details."""
+    if torch.cuda.is_available():
+        print(f"CUDA devices: {torch.cuda.device_count()}")
+        print(f"Current device: {torch.cuda.current_device()}")
+        print(torch.cuda.get_device_properties(torch.cuda.current_device()))
+    else:
+        print("CUDA is not available. Running on CPU.")
+check_gpu_availability()
 # Initialize FLUX pipeline only if CUDA is available
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 if device == "cuda":
     flux_pipe = DiffusionPipeline.from_pretrained(
         "black-forest-labs/FLUX.1-schnell",
 # Ensure proper multiprocessing start method
 multiprocessing.set_start_method("spawn", force=True)
 # Download necessary NLTK data
 def setup_nltk():
     """Ensure required NLTK data is available."""
     "PS: Generation of video by using Artificial Intelligence via FLUX, distilbart, and GTTS."
 )
 TITLE = "Video Story Generator with Audio by using FLUX, distilbart, and GTTS."
 # Load Tokenizer and Model for Text Summarization
 def load_text_summarization_model():
     """Load the tokenizer and model for text summarization on CPU.

     Both objects are created with ``from_pretrained`` from the
     "sshleifer/distilbart-cnn-12-6" checkpoint. This function performs no
     explicit device placement (there is no ``.to(device)`` call here), so the
     model is returned wherever ``from_pretrained`` put it.

     Returns:
         A ``(tokenizer, model)`` tuple.
     """
     print("Loading text summarization model...")
     # The tokenizer and the seq2seq model must come from the same checkpoint.
     tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
     model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
     return tokenizer, model
 tokenizer, model = load_text_summarization_model()
 @spaces.GPU()
 def generate_image_with_flux(
     print(f"DEBUG: Audio files merged and saved to {export_path}")
     return export_path
 # Function to generate video from text
 @spaces.GPU()
 def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
     print("DEBUG: Starting get_output_video function...")