WhisperCap / app.py
import os
import math
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from moviepy.editor import VideoFileClip
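
# WhisperCap: a Gradio app that transcribes a video's audio track with
# Whisper large-v3, streams progress updates to the UI, and can optionally
# produce an SRT subtitle file for download.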


def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
    # Use the GPU with float16 when available; fall back to CPU with float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # The model is (re)loaded on every call; hoisting this to module level
    # would avoid the reload cost on repeated runs.
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=10,  # shorter chunk length to prevent overly long captions
        batch_size=2,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )
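
    # chunk_length_s above controls the pipeline's internal long-form chunking;
    # the outer loop below also splits the audio into 10-second pieces so that
    # per-chunk progress can be reported (Gradio streams each yield to the UI).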

    if video_file is None:
        yield "Error: No video file provided.", None
        return

    video_path = video_file.name if hasattr(video_file, 'name') else video_file
    try:
        video = VideoFileClip(video_path)
    except Exception as e:
        yield f"Error processing video file: {str(e)}", None
        return

    audio = video.audio
    duration = video.duration
    n_chunks = math.ceil(duration / 10)  # split into 10-second chunks
    transcription_txt = ""
    transcription_srt = []
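
    # For each 10-second slice: write it to a temporary WAV file, run the ASR
    # pipeline on it, then shift the chunk-relative timestamps into video time.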
    for i in range(n_chunks):
        start = i * 10
        end = min((i + 1) * 10, duration)
        audio_chunk = audio.subclip(start, end)
        temp_file_path = f"temp_audio_{i}.wav"
        audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')
        # The pipeline accepts a file path directly, so no open() is needed.
        result = pipe(temp_file_path, generate_kwargs={"language": language})
        transcription_txt += result["text"]
        if transcribe_to_srt:
            for chunk in result["chunks"]:
                start_time, end_time = chunk["timestamp"]
                if start_time is not None and end_time is not None:
                    transcription_srt.append({
                        "start": start_time + i * 10,  # offset into video time
                        "end": end_time + i * 10,
                        "text": chunk["text"]
                    })
                else:
                    print(f"Warning: Invalid timestamp for chunk: {chunk}")
        os.remove(temp_file_path)
        yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%", None

    video.close()  # release the file handles held by moviepy

    output = ""
    srt_file_path = None
    if transcribe_to_text:
        output += "Text Transcription:\n" + transcription_txt + "\n\n"
    if transcribe_to_srt:
        output += "SRT Transcription:\n"
        srt_content = ""
        for i, sub in enumerate(transcription_srt, 1):
            srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
            output += srt_entry
            srt_content += srt_entry
        srt_file_path = "transcription.srt"
        with open(srt_file_path, "w", encoding="utf-8") as srt_file:
            srt_file.write(srt_content)
        output += f"\nSRT file saved as: {srt_file_path}"
    yield output, srt_file_path


def format_time(seconds):
    # Render seconds as an SRT timestamp: HH:MM:SS,mmm (comma decimal separator).
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
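
# Example: format_time(3661.5) -> "01:01:01,500"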


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Video(),
        gr.Checkbox(label="Transcribe to Text"),
        gr.Checkbox(label="Transcribe to SRT"),
        gr.Dropdown(choices=['en', 'he', 'it', 'es', 'fr', 'de', 'zh', 'ar'], label="Language")
    ],
    outputs=[
        gr.Textbox(label="Transcription Output"),
        gr.File(label="Download SRT")
    ],
    title="WhisperCap Video Transcription",
    description="Upload a video file to transcribe its audio using Whisper. You can download the SRT file if generated.",
)

iface.launch(share=True)
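
# Note: share=True also publishes a temporary public Gradio link; omit it to
# serve the app locally only.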