# NOTE(review): removed extraction residue that preceded the code
# (byte count, git-blame hashes, line-number ruler) — not valid Python.
import gradio as gr
import torch
import whisper
from moviepy.editor import AudioFileClip, ColorClip, concatenate_videoclips
from moviepy.video.VideoClip import TextClip
def generate_video(audio_path, language, lag, progress=gr.Progress(track_tqdm=True)):
    """Transcribe an audio file and render a 1280x720 subtitle video.

    Args:
        audio_path: Filesystem path of the input audio file.
        language: Language code handed to Whisper (e.g. "en").
        lag: Seconds of black screen prepended before the first subtitle,
            delaying the on-screen text relative to the audio track.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Path of the written MP4 file ("./transcribed_video.mp4").
    """
    # ``model`` is the module-level Whisper model loaded in the
    # ``__main__`` block below.
    progress(0.0, "Transcribing audio...")
    result = model.transcribe(audio_path, language=language)
    progress(0.30, "Audio transcribed!")

    # Open the audio once; reused for both duration padding and set_audio().
    audio_clip = AudioFileClip(audio_path)

    # Build one text clip per transcription segment, filling silent gaps
    # with black so the concatenated timeline stays aligned to the audio.
    clips = []
    total_segments = len(result["segments"])
    running_progress = 0.0
    current_time = 0.0
    for segment in result["segments"]:
        running_progress += 0.4 / total_segments
        # Black filler for any silence before this segment starts.
        if segment["start"] > current_time:
            clips.append(
                ColorClip((1280, 720), color=(0, 0, 0)).set_duration(
                    segment["start"] - current_time
                )
            )
        # NOTE: no .set_start() here — concatenate_videoclips recomputes
        # each clip's start from the cumulative durations, so the original
        # per-clip set_start() was dead code.
        clips.append(
            TextClip(
                segment["text"],
                fontsize=24,
                font="Arial",
                color="white",
                bg_color="black",
                size=(1280, 720),
            ).set_duration(segment["end"] - segment["start"])
        )
        current_time = segment["end"]
        progress(min(0.3 + running_progress, 0.7), "Generating video frames...")

    # Pad the tail so the video covers the full audio track; otherwise
    # set_audio() would cut the audio off at the last subtitle.
    if audio_clip.duration and audio_clip.duration > current_time:
        clips.append(
            ColorClip((1280, 720), color=(0, 0, 0)).set_duration(
                audio_clip.duration - current_time
            )
        )

    # Empty transcription fallback: concatenate_videoclips fails on an
    # empty list, so emit a single black frame spanning the audio.
    if not clips:
        clips.append(
            ColorClip((1280, 720), color=(0, 0, 0)).set_duration(
                audio_clip.duration or 1
            )
        )

    if lag > 0:
        clips.insert(0, ColorClip((1280, 720), color=(0, 0, 0)).set_duration(lag))
    progress(0.7, "Video frames generated!")

    # Concatenate clips and attach the audio track.
    progress(0.75, "Concatenating video clips...")
    video = concatenate_videoclips(clips, method="compose")
    progress(0.85, "Adding audio to video...")
    video = video.set_audio(audio_clip)

    # Export; a low fps is sufficient because the frames are static text.
    progress(0.90, "Exporting video...")
    output_path = "./transcribed_video.mp4"
    video.write_videofile(output_path, fps=6, codec="libx264", audio_codec="aac")
    progress(1.0, "Video exported!")
    return output_path
if __name__ == "__main__":
    # Load the Whisper model once at startup, preferring the GPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("base", device=DEVICE)

    # UI controls, listed in the order generate_video takes its arguments.
    audio_input = gr.Audio(
        sources=["upload", "microphone"], type="filepath", label="Audio File"
    )
    language_input = gr.Dropdown(
        ["en", "es", "fr", "de", "it", "nl", "ru", "zh"],
        label="Language",
        value="en",
    )
    lag_input = gr.Slider(
        minimum=0,
        maximum=10,
        step=1,
        value=0,
        label="Lag (seconds): delay the transcription by this amount of time.",
    )

    # Wire the controls to the transcription function and serve the app.
    iface = gr.Interface(
        fn=generate_video,
        inputs=[audio_input, language_input, lag_input],
        outputs=gr.Video(label="Play Video", show_download_button=True),
        title="Audio Transcription Video Generator",
        description="Upload your audio file and select the language for transcription.",
    )
    iface.launch()