ZeeAI1 committed on
Commit
adbe522
·
verified ·
1 Parent(s): 2fde467

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +28 -0
  2. README.md +7 -5
  3. app.py +78 -0
  4. requirements.txt +8 -0
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Set environment variables:
#   PIP_NO_CACHE_DIR          - do not keep pip's download cache in the image
#   PYTHONDONTWRITEBYTECODE   - do not write .pyc files
#   PYTHONUNBUFFERED          - unbuffered stdout/stderr so logs show up immediately
ENV PIP_NO_CACHE_DIR=true \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Install system dependencies.
# ffmpeg/libsndfile1 back the audio/video libraries; git is required because
# requirements.txt installs packages from git+https URLs.
# The apt package lists are removed in the same layer so they never end up in the image.
RUN apt-get update && apt-get install -y \
        ffmpeg \
        libsndfile1 \
        git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Set workdir
WORKDIR /app

# Copy only the dependency manifest first, so the (slow) dependency layer below
# is reused from cache when just app.py changes.
COPY requirements.txt requirements.txt

# Install Python dependencies (numpy is pinned before the rest, as in the original build order)
RUN pip install --upgrade pip \
    && pip install numpy==1.24.3 \
    && pip install -r requirements.txt

# Docker Spaces run the container as UID 1000 (see the Spaces Docker docs).
# Give that user a real home directory (model downloads are cached under $HOME)
# and ownership of /app (moviepy writes its temporary audio file to the working directory).
RUN useradd --create-home --uid 1000 user \
    && chown user /app
USER user
ENV HOME=/home/user

# Copy the application last: it changes most often.
COPY --chown=user app.py app.py

# Run the Streamlit app
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1,10 +1,12 @@
1
  ---
2
- title: Video0003Docker
3
- emoji: 👀
4
- colorFrom: pink
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: AI Voiceover V2 (Docker)
3
+ emoji: 🧠
4
+ colorFrom: gray
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
+ # 🧠 AI Voiceover V2: Replace One Speaker Only
11
+
12
+ Runs Whisper + Coqui TTS + audio mixing inside a Docker container for full compatibility and reliability.
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import whisper
3
+ from TTS.api import TTS
4
+ from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
5
+ import os
6
+ from tempfile import NamedTemporaryFile
7
+ import torchaudio
8
+
9
# Page setup: browser-tab title and layout, followed by the on-page heading.
st.set_page_config(
    layout="centered",
    page_title="AI Voiceover Generator V2",
)
st.title("🎤 AI Voiceover V2: Replace One Speaker Only")
12
+
13
+ # Load models
14
@st.cache_resource
def load_whisper_model():
    """Return the Whisper "small" speech-recognition model.

    Decorated with ``st.cache_resource`` so the loaded model object is
    handed to Streamlit's resource cache instead of being rebuilt by hand.
    """
    model = whisper.load_model("small")
    return model
17
+
18
@st.cache_resource
def load_tts_model():
    """Return a Coqui ``TTS`` synthesizer for the LJSpeech Tacotron2-DDC model.

    The synthesizer is created with the progress bar disabled and GPU
    usage turned off, and is handed to Streamlit's resource cache via
    ``st.cache_resource``.
    """
    tts_options = {
        "model_name": "tts_models/en/ljspeech/tacotron2-DDC",
        "progress_bar": False,
        "gpu": False,
    }
    return TTS(**tts_options)
21
+
22
whisper_model = load_whisper_model()
tts = load_tts_model()

# Upload video
video_file = st.file_uploader("Upload a short video clip (MP4 preferred)", type=["mp4", "mov", "avi"])

if video_file:
    # Persist the upload to disk: moviepy, Whisper and torchaudio are all given
    # file paths below. getvalue() returns the whole buffer regardless of the
    # current stream position, unlike read().
    with NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(video_file.getvalue())
        tmp_video_path = tmp_video.name

    # Derive every sibling path from the extension-less stem. The previous
    # str.replace(".mp4", ...) approach would rewrite *every* ".mp4" in the path.
    path_stem, _ = os.path.splitext(tmp_video_path)
    audio_path = path_stem + ".wav"
    ai_voice_path = path_stem + "_ai_voice.wav"
    mixed_path = path_stem + "_mixed.wav"
    final_path = path_stem + "_final_v2.mp4"

    st.video(tmp_video_path)

    # Extract audio
    video = VideoFileClip(tmp_video_path)
    if video.audio is None:
        # Silent clips have no audio attribute to export; stop with a clear
        # message instead of failing with an AttributeError.
        st.error("The uploaded video has no audio track to transcribe.")
        st.stop()
    video.audio.write_audiofile(audio_path)

    # Transcribe
    st.info("Transcribing using Whisper...")
    result = whisper_model.transcribe(audio_path)
    st.subheader("📝 Detected Speech")
    st.write(result["text"])

    # Custom voiceover input (pre-filled with the transcript)
    custom_text = st.text_area("Enter your custom voiceover text to replace one speaker:", result["text"])

    if st.button("Replace Only One Speaker's Voice"):
        if not custom_text.strip():
            # Do not hand empty text to the synthesizer.
            st.warning("Please enter some voiceover text first.")
            st.stop()

        # Generate new voiceover from custom text
        tts.tts_to_file(text=custom_text, file_path=ai_voice_path)
        st.audio(ai_voice_path)

        # Load both tracks as (channels, samples) tensors with their sample rates
        original_audio, sr = torchaudio.load(audio_path)
        ai_audio, ai_sr = torchaudio.load(ai_voice_path)

        # Bring the synthesized voice to the original sample rate before any
        # sample-wise arithmetic; otherwise it plays at the wrong speed/pitch
        # once saved with the original rate.
        if ai_sr != sr:
            ai_audio = torchaudio.functional.resample(ai_audio, ai_sr, sr)

        # Trim or zero-pad the AI voice to the original length (for demo purposes).
        # Padding uses plain tensor ops: torchaudio.functional has no pad().
        target_len = original_audio.shape[1]
        ai_len = ai_audio.shape[1]
        if ai_len < target_len:
            padded = ai_audio.new_zeros((ai_audio.shape[0], target_len))
            padded[:, :ai_len] = ai_audio
            ai_audio = padded
        else:
            ai_audio = ai_audio[:, :target_len]

        # Mix original and AI audio (simulating voice replacement, basic blend)
        # NOTE: This does NOT perform speaker diarization — it's a placeholder
        mixed_audio = (original_audio * 0.4) + (ai_audio * 0.6)
        torchaudio.save(mixed_path, mixed_audio, sr)

        # Final video: original picture with the mixed audio track
        final_video = video.set_audio(AudioFileClip(mixed_path))
        final_video.write_videofile(final_path, codec="libx264", audio_codec="aac")

        with open(final_path, "rb") as f:
            st.download_button(label="📥 Download Final Video with Mixed Voiceover", data=f, file_name="final_ai_video_v2.mp4")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/openai/whisper.git
2
+ git+https://github.com/coqui-ai/TTS.git
3
+ streamlit
4
+ moviepy
5
+ ffmpeg-python
6
+ torchaudio
7
+ torch
8
+ numba