|
|
|
|
|
|
|
|
|
|
|
import contextlib
import os
import re
import tempfile
import time

import google.generativeai as genai
import gradio as gr
import requests
from google.generativeai.types import HarmBlockThreshold, HarmCategory
from moviepy.editor import AudioFileClip, VideoFileClip
from pydub import AudioSegment
|
|
|
|
|
|
|
|
|
# Runtime configuration comes from environment variables; only the TTS
# endpoint has a built-in fallback URL.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
TTS_API_URL = os.getenv(
    "TTS_API_URL",
    "https://athspi-aitools-aittsg.hf.space/api/generate-tts/",
)

# Fail fast at import time when a required setting is missing or empty.
if not GEMINI_API_KEY:
    raise ValueError(
        "GEMINI_API_KEY secret not found! Please set it in your environment or Colab/Hugging Face secrets."
    )
if not TTS_API_URL:
    raise ValueError("TTS_API_URL secret not found or empty! Please set it.")

# Hand the API key to the Gemini client library.
genai.configure(api_key=GEMINI_API_KEY)
|
|
|
|
|
# Maps the label offered in the UI radio selector to the voice name that is
# placed in the TTS request payload. Insertion order is the display order.
VOICE_CHOICES = {
    "Male (Charon)": "Charon",
    "Female (Zephyr)": "Zephyr",
}
|
|
|
|
|
# Instruction text sent to Gemini together with the uploaded video. It asks
# for one continuous Tamil script with English style prompts and performance
# tags, and without timestamps or speaker labels.
# NOTE(review): the sub-bullet indentation under item 4 was reconstructed from
# a whitespace-mangled copy — confirm against the original prompt.
GEMINI_PROMPT = """
You are an AI scriptwriter. Your task is to watch the provided video and transcribe ALL spoken dialogue into a SINGLE, CONTINUOUS block of modern, colloquial Tamil.

**CRITICAL INSTRUCTIONS:**

1. **Single Script:** Combine all dialogue from all speakers into one continuous script. The final output should be a single paragraph of text.
2. **NO Timestamps:** Do NOT include any timestamps (e.g., [00:01 - 00:03]).
3. **NO Speaker Labels:** Do NOT include any speaker labels or identifiers (e.g., ஆண்_1, பெண்_2).
4. **Incorporate Performance:** To make the script expressive for a single narrator, add English style prompts and performance tags directly into the text.
    - Use style prompts like `Say happily:`, `Whisper mysteriously:`, `Shout angrily:`.
    - Use performance tags like `[laugh]`, `[singing]`, `[sigh]`.

**EXAMPLE OUTPUT:**
Say happily: வணக்கம்! [laugh] எப்படி இருக்கீங்க? Whisper mysteriously: அந்த ரகசியம் எனக்கு மட்டும் தான் தெரியும். Shout angrily: உடனே இங்கிருந்து போ!
"""
|
|
|
|
|
def upload_to_gemini(path, mime_type=None):
    """Upload a local file to Gemini and return the resulting file object.

    Args:
        path: Location of the file to upload.
        mime_type: Optional MIME type forwarded to ``genai.upload_file``.

    Returns:
        Whatever ``genai.upload_file`` returns for the uploaded file.
    """
    print(f"Uploading file: {path}")
    uploaded = genai.upload_file(path, mime_type=mime_type)
    print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
    return uploaded
|
|
|
|
|
def wait_for_files_active(files, poll_interval=10, timeout=None):
    """Block until every given Gemini file has left the PROCESSING state.

    Each file is re-fetched by name with ``genai.get_file`` and polled until
    its state name is no longer ``"PROCESSING"``.

    Args:
        files: Iterable of objects exposing a ``name`` attribute.
        poll_interval: Seconds to sleep between polls (default 10, the
            previously hard-coded value).
        timeout: Optional overall limit in seconds for the whole call.
            ``None`` (the default) waits indefinitely, as before.

    Raises:
        TimeoutError: If ``timeout`` elapses while a file is still processing.
        Exception: If a file ends in any state other than ``"ACTIVE"``.
    """
    print("Waiting for file processing...")
    # One shared deadline for all files; monotonic time is immune to clock changes.
    deadline = None if timeout is None else time.monotonic() + timeout

    for pending in files:
        name = pending.name
        current = genai.get_file(name)
        while current.state.name == "PROCESSING":
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Timed out after {timeout}s waiting for file {name} to finish processing"
                )
            # Progress dots stay on one line until the final status message.
            print(".", end="", flush=True)
            time.sleep(poll_interval)
            current = genai.get_file(name)
        if current.state.name != "ACTIVE":
            raise Exception(f"File {current.name} failed to process")

    print("...all files ready")
    print()
|
|
|
|
|
def generate_tamil_script(video_file_path):
    """Generate a single, continuous Tamil script from the video.

    The video is uploaded to Gemini, the call waits until the upload is
    active, the model is asked to follow ``GEMINI_PROMPT``, and the uploaded
    file is deleted again whether or not generation succeeded.

    Args:
        video_file_path: Path of the video file to transcribe.

    Returns:
        The script collapsed onto one line, or a message starting with
        ``"Error:"`` when anything fails or no text was produced. This
        function does not raise; every failure is reported through the
        returned string.
    """
    video_file = None
    try:
        video_file = upload_to_gemini(video_file_path, mime_type="video/mp4")
        wait_for_files_active([video_file])

        model = genai.GenerativeModel(model_name="models/gemini-1.5-pro-latest")
        print("Generating single narrator script...")
        response = model.generate_content(
            [GEMINI_PROMPT, video_file],
            request_options={"timeout": 1000},
            safety_settings={
                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            },
        )
        # Read .text inside the try: the accessor may itself raise (presumably
        # when the response carries no usable text part — confirm in SDK docs).
        raw_text = response.text or ""
    except Exception as e:
        # Keep the "Error:" prefix so callers testing startswith("Error:")
        # recognise this path as a failure too.
        return f"Error: Gemini generation failed: {str(e)}"
    finally:
        # Always try to remove the upload, even when generation failed, so
        # files do not accumulate on the Gemini side.
        if video_file is not None:
            print(f"Deleting uploaded file from Gemini: {video_file.name}")
            try:
                genai.delete_file(video_file.name)
            except Exception as cleanup_error:
                # Best effort: a failed delete must not discard a good script.
                print(f"Warning: could not delete uploaded file: {cleanup_error}")

    # Collapse the response to one line, dropping blank lines so they do not
    # turn into runs of spaces.
    script = " ".join(
        segment.strip() for segment in raw_text.splitlines() if segment.strip()
    )
    if not script:
        return "Error: No valid script was generated by Gemini."
    return script
|
|
|
|
|
def generate_single_audio_track(dialogue_text, voice_name, is_cheerful, output_path):
    """Generate one continuous audio track for the entire script.

    The script is POSTed as JSON (keys ``text``, ``voice_name``, ``cheerful``)
    to ``TTS_API_URL`` and the response body is written to ``output_path``.

    Args:
        dialogue_text: Script text to synthesise.
        voice_name: Voice identifier forwarded to the TTS API.
        is_cheerful: Flag forwarded to the TTS API as ``cheerful``.
        output_path: File path that receives the returned audio bytes.

    Returns:
        ``True`` when the API answered with status 200 and the file was
        written; ``False`` for empty input, a non-200 status, or any
        exception (which is printed rather than raised).
    """
    # Nothing to synthesise: skip the network round trip entirely.
    if not dialogue_text or not dialogue_text.strip():
        print("An error occurred in generate_single_audio_track: empty script text")
        return False

    # (connect, read) timeouts in seconds. Without them requests waits forever
    # on an unresponsive server; the read limit is generous because one call
    # synthesises the whole script.
    request_timeout = (10, 600)

    try:
        print(f"Generating single audio track with voice '{voice_name}' | Cheerful: {is_cheerful}")

        payload = {
            "text": dialogue_text,
            "voice_name": voice_name,
            "cheerful": is_cheerful,
        }

        response = requests.post(TTS_API_URL, json=payload, timeout=request_timeout)

        if response.status_code != 200:
            print(f"Error from TTS API: {response.status_code} - {response.text}")
            return False

        with open(output_path, "wb") as f:
            f.write(response.content)
        print(f"Audio track saved successfully to {output_path}")
        return True

    except Exception as e:
        print(f"An error occurred in generate_single_audio_track: {str(e)}")
        return False
|
|
|
|
|
def replace_video_audio(video_path, new_audio_path, output_path):
    """Replace the audio of a video with a new audio file.

    Args:
        video_path: Path of the source video.
        new_audio_path: Path of the audio file to attach.
        output_path: Path where the re-encoded video (libx264 / aac) is written.

    Returns:
        ``output_path`` on success, otherwise a message starting with
        ``"Error:"``. Exceptions are not propagated.
    """
    video_clip = None
    audio_clip = None
    final_clip = None
    try:
        video_clip = VideoFileClip(video_path)
        audio_clip = AudioFileClip(new_audio_path)
        final_clip = video_clip.set_audio(audio_clip)
        final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
        return output_path
    except Exception as e:
        # Prefix with "Error:" so callers that test startswith("Error:") do
        # not mistake this message for an output file path.
        return f"Error: replacing video audio failed: {str(e)}"
    finally:
        # Close whatever was opened, on failure as well as on success.
        for clip in (video_clip, audio_clip, final_clip):
            if clip is None:
                continue
            try:
                clip.close()
            except Exception as close_error:
                # Best effort: a failed close must not change the result.
                print(f"Warning: failed to close clip: {close_error}")
|
|
|
|
|
def process_video_single_speaker(video_path, voice_choice, is_cheerful):
    """Process the video for single-speaker dubbing.

    Steps: generate the Tamil script, synthesise it as one audio track, then
    mux that track onto the original video.

    Args:
        video_path: Path of the uploaded video; falsy means nothing uploaded.
        voice_choice: A key of ``VOICE_CHOICES`` selecting the narrator voice.
        is_cheerful: Forwarded to the TTS step.

    Returns:
        ``(output_video_path, script)`` on success, or ``(None, message)``
        describing what went wrong. Exceptions are not propagated.
    """
    if not video_path:
        return None, "Please upload a video file first."

    temp_audio_path = None
    output_video_path = None
    succeeded = False
    try:
        # Check the voice before the expensive Gemini call so a bad choice
        # fails immediately with a readable message.
        voice_name = VOICE_CHOICES.get(voice_choice)
        if voice_name is None:
            return None, f"Unknown voice choice: {voice_choice!r}"

        print("-" * 50)
        print(f"Starting single-speaker processing for: {video_path}")

        script = generate_tamil_script(video_path)
        # The script step reports failures as strings. Match both of its
        # formats so an error message is never sent on to TTS as dialogue.
        if not script or script.startswith(("Error:", "Error in Gemini generation:")):
            return None, script or "Error: No valid script was generated by Gemini."
        print("\n--- Generated Script ---\n", script, "\n------------------------\n")

        # delete=False: only the reserved name is needed; the file is
        # written later by the TTS step.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name

        if not generate_single_audio_track(script, voice_name, is_cheerful, temp_audio_path):
            return None, "Failed to generate the audio track."

        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video:
            output_video_path = temp_video.name

        result = replace_video_audio(video_path, temp_audio_path, output_video_path)
        # Same string-based error convention as the script step.
        if isinstance(result, str) and result.startswith(
            ("Error:", "Error replacing video audio:")
        ):
            return None, result

        succeeded = True
        print("Processing complete!")
        print("-" * 50)
        return result, script

    except Exception as e:
        return None, f"An unexpected error occurred: {str(e)}"
    finally:
        # The intermediate audio is never needed after this call returns.
        if temp_audio_path is not None:
            with contextlib.suppress(OSError):
                os.remove(temp_audio_path)
        # Keep the output video only when it is actually handed to the caller.
        if not succeeded and output_video_path is not None:
            with contextlib.suppress(OSError):
                os.remove(output_video_path)
|
|
|
|
|
def create_gradio_interface():
    """Build the Gradio Blocks UI for single-speaker dubbing and return it.

    The left column holds the inputs (video upload, narrator voice radio,
    cheerful-tone checkbox, submit button); the right column shows the dubbed
    video and the generated script. Clicking the button runs
    ``process_video_single_speaker``.
    """
    voice_labels = list(VOICE_CHOICES.keys())

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("# AI Single-Speaker Video Dubbing")
        gr.Markdown("Upload a video and choose a voice. The AI will transcribe all speech into a single script and re-voice the entire video with the selected narrator.")

        with gr.Row():
            # Input side.
            with gr.Column(scale=1):
                source_video = gr.Video(label="Upload Video File")
                narrator_voice = gr.Radio(
                    voice_labels,
                    label="Select Narrator Voice",
                    value="Male (Charon)",
                )
                cheerful_toggle = gr.Checkbox(label="Enable Cheerful Tone", value=False)
                run_button = gr.Button("Generate Dubbed Video", variant="primary")

            # Output side.
            with gr.Column(scale=1):
                dubbed_video = gr.Video(label="Dubbed Video")
                script_box = gr.Textbox(label="Generated Full Script", lines=15, interactive=False)

        run_button.click(
            fn=process_video_single_speaker,
            inputs=[source_video, narrator_voice, cheerful_toggle],
            outputs=[dubbed_video, script_box],
        )

    return app
|
|
|
|
|
if __name__ == "__main__":
    # Build the interface and serve it with Gradio's debug mode switched on.
    create_gradio_interface().launch(debug=True)