import os
import subprocess
import sys

import gradio as gr
from gtts import gTTS
from pydub import AudioSegment


def generate_video(image, text):
    """Generate a lip-synced talking video from a still image and text.

    Pipeline: still image -> 5 s silent MP4 (ffmpeg) -> gTTS speech ->
    WAV (pydub) -> Wav2Lip inference producing the final video.

    Args:
        image: PIL.Image uploaded via Gradio (any mode; converted to RGB
            before JPEG save, since JPEG cannot store an alpha channel).
        text: Text to synthesize as English speech.

    Returns:
        Path to the rendered video file ("output.mp4").

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only.
        subprocess.CalledProcessError: If ffmpeg or Wav2Lip inference fails.
    """
    # Fail fast with a user-visible message instead of letting gTTS raise
    # an opaque error on empty input.
    if not text or not text.strip():
        raise gr.Error("Please enter some text for the character to speak.")

    # Save the input image; RGBA/P-mode uploads (e.g. PNGs) cannot be
    # written as JPEG, so normalize to RGB first.
    image_path = "input.jpg"
    image.convert("RGB").save(image_path)

    _make_base_video(image_path, "input.mp4")
    _synthesize_speech(text, "tts.wav")

    # Run Wav2Lip inference with the same interpreter running this script,
    # so the correct virtualenv/conda environment is used.
    cmd = [
        sys.executable, "Wav2Lip/inference.py",
        "--checkpoint_path", "checkpoints/wav2lip.pth",
        "--face", "input.mp4",
        "--audio", "tts.wav",
        "--outfile", "output.mp4",
    ]
    subprocess.run(cmd, check=True)

    return "output.mp4"


def _make_base_video(image_path, out_path):
    """Render a 5-second silent 512x512 H.264 video from a single image."""
    cmd = [
        "ffmpeg", "-y",
        "-loop", "1",
        "-i", image_path,
        "-t", "5",
        # yuv420p is required for broad player compatibility with libx264.
        "-vf", "fps=25,scale=512:512",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        out_path,
    ]
    subprocess.run(cmd, check=True)


def _synthesize_speech(text, wav_path):
    """Synthesize English speech from *text* and save it as a WAV file.

    gTTS only emits MP3, but Wav2Lip expects WAV, so pydub transcodes.
    """
    tts = gTTS(text, lang="en")
    tts.save("tts.mp3")
    AudioSegment.from_mp3("tts.mp3").export(wav_path, format="wav")


# Build the Gradio interface: image + text in, rendered video out.
iface = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Image(type="pil", label="Upload Cartoon Image"),
        gr.Textbox(lines=2, placeholder="Enter text for speech...", label="Text to Speak"),
    ],
    outputs=gr.Video(label="Talking Cartoon Video"),
    title="Cartoon Talking Video Generator",
    description="Upload a cartoon image and enter text to generate a talking video using Wav2Lip.",
)

if __name__ == "__main__":
    iface.launch()