Spaces:

malvin-ai
/

light-ai-video-generator

Running on Zero

File size: 6,983 Bytes

#generate_subtitles.py

import random
import os
import torch
from moviepy import (
    VideoFileClip,
    TextClip,
    CompositeVideoClip,
    ImageClip,
    vfx
)
from moviepy.video.fx import FadeIn, Resize
import spaces



# Font handed to TextClip (as its `font` argument) for every subtitle.
# NOTE(review): the value is a bare font name rather than a file path — confirm it
# resolves on the deployment machine.
FONT_PATH = "DejaVuSans-Bold"



# Palette of "flashy" colors from which each subtitle's color is drawn
SUBTITLE_COLORS = [
    "white", "yellow", "cyan", "deeppink", "gold", "lightgreen", "magenta", "orange"
]




def color_for_word(word: str) -> str:
    """Return a color name picked at random from SUBTITLE_COLORS.

    The ``word`` argument is accepted but not consulted: the pick does not
    depend on the text being colored.
    """
    picked_color = random.choice(SUBTITLE_COLORS)
    return picked_color





def chunk_text_by_words(segments, max_words=1):
    """Split each transcription segment into short subtitles of at most ``max_words`` words.

    A segment's duration is shared evenly between its whitespace-separated
    words; each chunk then spans the time slots of the words it contains.
    Segments with no words or a non-positive duration are skipped.

    Args:
        segments: iterable of dicts with 'start', 'end' (numbers, seconds)
            and 'text' (str) keys.
        max_words: maximum number of words per subtitle (must be >= 1).

    Returns:
        A list of dicts with 'start', 'end' and 'text' keys, in input order.

    Raises:
        ValueError: if ``max_words`` is smaller than 1.
    """
    # A zero step would crash inside range() and a negative one would silently
    # produce no subtitles at all, so reject both up front.
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words!r}")

    # Report the real limit instead of a hard-coded value.
    print(f"✂️ Découpage en sous-titres dynamiques ({max_words} mots max)...")
    subs = []
    for seg in segments:
        words = seg['text'].strip().split()
        seg_duration = seg['end'] - seg['start']
        if not words or seg_duration <= 0:
            continue

        # Every word gets an equal share of the segment's duration.
        word_duration = seg_duration / len(words)

        for i in range(0, len(words), max_words):
            chunk_words = words[i:i + max_words]
            start_time = seg['start'] + i * word_duration
            subs.append({
                "start": start_time,
                "end": start_time + len(chunk_words) * word_duration,
                "text": " ".join(chunk_words),
            })

    print(f"🧩 {len(subs)} sous-titres créés (dynamiques).")
    return subs


def save_subtitles_to_srt(subtitles, output_path):
    """Write subtitles to ``output_path`` in SubRip (.srt) format.

    Args:
        subtitles: iterable of dicts with 'start', 'end' (seconds) and
            'text' (str) keys; entries are numbered from 1 in input order.
        output_path: destination file, written as UTF-8 (overwritten if present).
    """
    def format_timestamp(seconds):
        # Round once to whole milliseconds, then split with integer arithmetic.
        # Truncating the fractional part separately loses a millisecond to
        # float error (1.001 s -> ",000") and cannot carry into the seconds.
        total_ms = max(0, int(round(seconds * 1000)))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

    with open(output_path, "w", encoding="utf-8") as f:
        for index, sub in enumerate(subtitles, 1):
            f.write(f"{index}\n")
            f.write(f"{format_timestamp(sub['start'])} --> {format_timestamp(sub['end'])}\n")
            # A blank line terminates each SRT entry.
            f.write(f"{sub['text'].strip()}\n\n")

def transcribe_audio_to_subs(audio_path):
    """Transcribe an audio file with Whisper and save the result as an .srt file.

    The "medium" Whisper model is loaded on CPU. The .srt file is written next
    to the audio file (same path, extension replaced by ``.srt``).

    Args:
        audio_path: path of the audio file to transcribe.

    Returns:
        A list of dicts with 'start', 'end' and 'text' keys, one per segment
        returned by Whisper.
    """
    print("🎙️ Transcription avec Whisper...")

    # Keep Torch from detecting CUDA; set before whisper is imported below.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    import whisper
    transcription = whisper.load_model("medium", device="cpu").transcribe(audio_path)

    # Keep only the three fields the rest of the pipeline uses.
    subtitles = [
        {key: segment[key] for key in ("start", "end", "text")}
        for segment in transcription['segments']
    ]

    print(f"📝 {len(subtitles)} sous-titres générés.")

    # Save the .srt alongside the audio file
    stem, _extension = os.path.splitext(audio_path)
    srt_path = f"{stem}.srt"
    save_subtitles_to_srt(subtitles, srt_path)
    print(f"💾 Sous-titres enregistrés dans : {srt_path}")

    return subtitles

def format_subtitle_text(text, max_chars=50):
    """Wrap ``text`` onto at most two lines of roughly ``max_chars`` characters.

    Words are never split: a single word longer than ``max_chars`` is placed
    alone on its line. Anything that would fall on a third line is dropped.

    Args:
        text: subtitle text; surrounding whitespace is ignored.
        max_chars: maximum line length, in characters.

    Returns:
        The wrapped text with lines joined by a newline ("" for blank input).
    """
    lines = []
    current_line = ""

    for word in text.strip().split():
        candidate = f"{current_line} {word}" if current_line else word
        # An empty line always accepts the word: otherwise an over-long first
        # word would emit a blank first line and push real text past the
        # two-line cut-off.
        if not current_line or len(candidate) <= max_chars:
            current_line = candidate
        else:
            lines.append(current_line)
            current_line = word

    # Flush the last line being built
    if current_line:
        lines.append(current_line)

    # Keep the first two lines only
    return "\n".join(lines[:2])


def create_animated_subtitle_clip(text, start, end, video_w, video_h):
    """Build the animated text clip for one subtitle chunk.

    The clip is drawn in a randomly chosen color, horizontally centered at one
    of three preset heights (45%, 55% or 60% of the video height, picked at
    random), shown from ``start`` to ``end``, and given two effects: a 0.2 s
    FadeIn and a progressive "pop" that scales it from 1.0 up to 1.07.

    Args:
        text: text to display.
        start: time (seconds) at which the subtitle appears.
        end: time (seconds) at which the subtitle disappears.
        video_w: video width in pixels; the text box is 80% of it.
        video_h: video height in pixels; used for vertical placement.

    Returns:
        The clip produced by applying FadeIn and then Resize to the TextClip.
    """
    text_color = color_for_word(text.strip())

    # Base text clip: fixed-width caption box (80% of the video width, automatic
    # height) with centered text and a black outline.
    base_clip = TextClip(
        text=text,
        font=FONT_PATH,
        font_size=100,
        color=text_color,
        stroke_color="black",
        stroke_width=6,
        method="caption",
        size=(int(video_w * 0.8), None),
        text_align="center",
        horizontal_align="center",
        vertical_align="center",
        interline=4,
        transparent=True
    )

    # Vertical position picked at random among the preset heights
    y_position = random.choice([int(video_h * ratio) for ratio in (0.45, 0.55, 0.6)])
    timed_clip = (
        base_clip
        .with_position(("center", y_position))
        .with_start(start)
        .with_end(end)
    )

    # 1) 0.2 s fade-in
    faded_clip = FadeIn(duration=0.2).apply(timed_clip)

    # 2) progressive growth over the subtitle's duration
    span = end - start

    def pop_effect(t):
        # No growth when the subtitle has no positive duration.
        if not span > 0:
            return 1.0
        progress = t / span
        # Ease-out cubic from 1.0 towards 1.07
        return 1.0 + 0.07 * (1 - (1 - progress) ** 3)

    return Resize(pop_effect).apply(faded_clip)


def add_subtitles_to_video(video_path, subtitles, output_file="./assets/output/video_with_subs.mp4"):
    """Overlay animated, colored subtitles on a video and export the result.

    The video is resized to 1080×1920 when it does not already have that size
    (a plain resize, so other aspect ratios are stretched, not cropped). The
    result is encoded as H.264 + AAC at 30 fps.

    Args:
        video_path: path of the input video.
        subtitles: iterable of dicts with 'start', 'end' (seconds) and 'text' keys.
        output_file: destination path; its parent directory is created if missing.
    """
    print("🎬 Insertion des sous-titres optimisés SHORTS...")

    source = VideoFileClip(video_path)
    try:
        video = source

        # Force the vertical 1080×1920 format when the input differs
        if (video.w, video.h) != (1080, 1920):
            print("📐 Recadrage vidéo en 1080×1920...")
            # MoviePy v2 (the API used throughout this file) names this method
            # `resized`; the v1 `resize` attribute no longer exists.
            video = video.resized((1080, 1920))

        clips = [video]
        clips.extend(
            create_animated_subtitle_clip(
                sub['text'], sub['start'], sub['end'], video_w=video.w, video_h=video.h
            )
            for sub in subtitles
        )

        final = CompositeVideoClip(clips, size=(1080, 1920)).with_duration(video.duration)

        # Make sure the destination directory exists before ffmpeg writes to it.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Export as MP4 H.264 + AAC, 30 fps
        final.write_videofile(
            output_file,
            codec="libx264",
            audio_codec="aac",
            fps=30,
            threads=4,
            preset="medium",
            ffmpeg_params=["-pix_fmt", "yuv420p"]
        )
    finally:
        # Release the file reader even if composition or export fails. Close the
        # original clip: a resized copy does not own the reader.
        source.close()

    print(f"✅ Vidéo Shorts/TikTok prête : {output_file}")