import streamlit as st
import whisper
from TTS.api import TTS
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
import os
from tempfile import NamedTemporaryFile
import torch
import torchaudio

# Page config
st.set_page_config(page_title="AI Voiceover Generator V2", layout="centered")
st.title("🎤 AI Voiceover V2: Replace One Speaker Only")

# Load models (cached so they are not reloaded on every Streamlit rerun)
@st.cache_resource
def load_whisper_model():
    return whisper.load_model("small")

@st.cache_resource
def load_tts_model():
    # Single-speaker English voice trained on the LJSpeech dataset
    return TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

whisper_model = load_whisper_model()
tts = load_tts_model()
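
# Note: whisper.load_model() picks CUDA automatically when a GPU is available,
# while the Coqui TTS model above is pinned to CPU via gpu=False.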

# Upload video
video_file = st.file_uploader("Upload a short video clip (MP4 preferred)", type=["mp4", "mov", "avi"])

if video_file:
    # The upload arrives as an in-memory buffer; MoviePy and Whisper need a real
    # file path, so write it to a temporary file on disk first.
    with NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(video_file.read())
        tmp_video_path = tmp_video.name

    st.video(tmp_video_path)

    # Extract audio
    video = VideoFileClip(tmp_video_path)
    audio_path = tmp_video_path.replace(".mp4", ".wav")
    video.audio.write_audiofile(audio_path)

    # Transcribe
    st.info("Transcribing using Whisper...")
    result = whisper_model.transcribe(audio_path)
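    # result is a plain dict: result["text"] is the full transcript, while
    # result["segments"] (unused here) carries per-segment timestamps that a real
    # per-speaker replacement would need.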

    st.subheader("📝 Detected Speech")
    st.write(result["text"])

    # Custom voiceover input
    custom_text = st.text_area("Enter your custom voiceover text to replace one speaker:", result["text"])

    if st.button("Replace Only One Speaker's Voice"):
        # Generate new voiceover from custom text
        ai_voice_path = audio_path.replace(".wav", "_ai_voice.wav")
        tts.tts_to_file(text=custom_text, file_path=ai_voice_path)
        st.audio(ai_voice_path)

        # Load the original track and the generated AI voice
        original_audio, sr = torchaudio.load(audio_path)
        ai_audio, ai_sr = torchaudio.load(ai_voice_path)
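
        # The LJSpeech Tacotron2 model typically writes 22.05 kHz audio while the
        # track extracted by MoviePy is usually 44.1 kHz, so resample the AI voice
        # to the original rate before mixing (otherwise it plays back at the wrong
        # speed and pitch).
        if ai_sr != sr:
            ai_audio = torchaudio.functional.resample(ai_audio, ai_sr, sr)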

        # Trim or pad AI voice to match duration (for demo purposes)
        if ai_audio.shape[1] < original_audio.shape[1]:
            diff = original_audio.shape[1] - ai_audio.shape[1]
            # (0, diff) appends `diff` zero samples to the end of the waveform
            ai_audio = torch.nn.functional.pad(ai_audio, (0, diff))
        else:
            ai_audio = ai_audio[:, :original_audio.shape[1]]

        # Mix original and AI audio (simulating voice replacement, basic blend)
        # NOTE: this does NOT perform speaker diarization; it is only a placeholder mix
        mixed_audio = (original_audio * 0.4) + (ai_audio * 0.6)
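        # Note: if the original track is stereo ([2, N]) and the TTS voice is mono
        # ([1, N]), the addition above broadcasts the mono voice onto both channels.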
        mixed_path = audio_path.replace(".wav", "_mixed.wav")
        torchaudio.save(mixed_path, mixed_audio, sr)

        # Final video
        final_video = video.set_audio(AudioFileClip(mixed_path))
        final_path = tmp_video_path.replace(".mp4", "_final_v2.mp4")
        final_video.write_videofile(final_path, codec="libx264", audio_codec="aac")

        with open(final_path, "rb") as f:
            st.download_button(label="📥 Download Final Video with Mixed Voiceover", data=f, file_name="final_ai_video_v2.mp4")
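
# NOTE: intermediate files (tmp_video_path, audio_path, ai_voice_path, mixed_path,
# final_path) are left on disk; a long-running deployment would want to clean them
# up with os.remove() once the download has been served.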