# Video0003Docker / app.py
import streamlit as st
import whisper
from TTS.api import TTS
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
import os
from tempfile import NamedTemporaryFile
import torchaudio
import torch.nn.functional as F

# Page config
st.set_page_config(page_title="AI Voiceover Generator V2", layout="centered")
st.title("🎤 AI Voiceover V2: Replace One Speaker Only")

# Load models
@st.cache_resource
def load_whisper_model():
    return whisper.load_model("small")


@st.cache_resource
def load_tts_model():
    return TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)


whisper_model = load_whisper_model()
tts = load_tts_model()
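# Whisper "small" handles transcription; the LJSpeech Tacotron2-DDC voice is a
# single-speaker English model, so every generated voiceover uses that same voice.
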
# Upload video
video_file = st.file_uploader("Upload a short video clip (MP4 preferred)", type=["mp4", "mov", "avi"])
if video_file:
    with NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(video_file.read())
        tmp_video_path = tmp_video.name

    st.video(tmp_video_path)

    # Extract audio
    video = VideoFileClip(tmp_video_path)
    audio_path = tmp_video_path.replace(".mp4", ".wav")
    video.audio.write_audiofile(audio_path)

    # Transcribe
    st.info("Transcribing using Whisper...")
    result = whisper_model.transcribe(audio_path)
    st.subheader("📝 Detected Speech")
    st.write(result["text"])

    # Custom voiceover input
    custom_text = st.text_area("Enter your custom voiceover text to replace one speaker:", result["text"])
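
    # The box is pre-filled with the Whisper transcript, so only the lines meant
    # for the AI voice need to be edited before generating the replacement audio.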
    if st.button("Replace Only One Speaker's Voice"):
        # Generate new voiceover from custom text
        ai_voice_path = audio_path.replace(".wav", "_ai_voice.wav")
        tts.tts_to_file(text=custom_text, file_path=ai_voice_path)
        st.audio(ai_voice_path)
        # Load original audio and the generated AI voiceover
        original_audio, sr = torchaudio.load(audio_path)
        ai_audio, ai_sr = torchaudio.load(ai_voice_path)

        # The TTS output may use a different sample rate than the video's track,
        # so resample it before mixing
        if ai_sr != sr:
            ai_audio = torchaudio.functional.resample(ai_audio, ai_sr, sr)

        # Trim or pad AI voice to match duration (for demo purposes)
        if ai_audio.shape[1] < original_audio.shape[1]:
            diff = original_audio.shape[1] - ai_audio.shape[1]
            ai_audio = F.pad(ai_audio, (0, diff))
        else:
            ai_audio = ai_audio[:, :original_audio.shape[1]]
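
        # NOTE: the fixed 0.4 / 0.6 weights below simply overlay the two tracks;
        # proper replacement would first isolate the target speaker's segments.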
        # Mix original and AI audio (simulating voice replacement, basic blend)
        # NOTE: This does NOT perform speaker diarization; it's a placeholder
        mixed_audio = (original_audio * 0.4) + (ai_audio * 0.6)
        mixed_path = audio_path.replace(".wav", "_mixed.wav")
        torchaudio.save(mixed_path, mixed_audio, sr)
        # Final video
        final_video = video.set_audio(AudioFileClip(mixed_path))
        final_path = tmp_video_path.replace(".mp4", "_final_v2.mp4")
        final_video.write_videofile(final_path, codec="libx264", audio_codec="aac")

        with open(final_path, "rb") as f:
            st.download_button(label="📥 Download Final Video with Mixed Voiceover", data=f, file_name="final_ai_video_v2.mp4")