# WhisperCap / app.py
import os
import math
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from moviepy.editor import AudioFileClip
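
# Flow: the uploaded audio is split into 30-second chunks with MoviePy,
# each chunk is transcribed by the Whisper large-v3 pipeline, and the
# results are assembled into plain text and/or SRT subtitles.
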

def transcribe(audio_file, transcribe_to_text, transcribe_to_srt, language):
    """Transcribe an audio file with Whisper, yielding progress updates and then the result."""
    # Prefer the GPU with half precision when one is available.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=2,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": language},
    )
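
    # Note: chunk_length_s already lets the pipeline handle long audio internally;
    # the manual 30-second split below presumably exists to report per-chunk progress.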

    # gr.Audio(type="filepath") passes a path string, not a file object.
    audio = AudioFileClip(audio_file)
    duration = audio.duration
    n_chunks = math.ceil(duration / 30)
    transcription_txt = ""
    transcription_srt = []

    for i in range(n_chunks):
        start = i * 30
        end = min((i + 1) * 30, duration)
        audio_chunk = audio.subclip(start, end)  # MoviePy 1.x API
        temp_file_path = f"temp_audio_{i}.wav"
        audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')
        # The pipeline accepts a path directly; there is no need to open the file first.
        result = pipe(temp_file_path)
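        # With return_timestamps=True the result has the shape:
        # {"text": "...", "chunks": [{"timestamp": (start, end), "text": "..."}, ...]}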
        transcription_txt += result["text"]
        if transcribe_to_srt:
            for chunk in result["chunks"]:
                start_time, end_time = chunk["timestamp"]
                # Whisper can leave the final end timestamp as None;
                # fall back to the end of the current 30-second chunk.
                if end_time is None:
                    end_time = end - start
                transcription_srt.append({
                    "start": start_time + i * 30,  # offset into the full audio
                    "end": end_time + i * 30,
                    "text": chunk["text"]
                })
        os.remove(temp_file_path)
        yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%"

    output = ""
    if transcribe_to_text:
        output += "Text Transcription:\n" + transcription_txt + "\n\n"
    if transcribe_to_srt:
        output += "SRT Transcription:\n"
        for i, sub in enumerate(transcription_srt, 1):
            output += f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
    yield output


def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    # SRT uses a comma as the decimal separator.
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
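
# Example: format_time(3723.5) -> "01:02:03,500"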

iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Checkbox(label="Transcribe to Text"),
        gr.Checkbox(label="Transcribe to SRT"),
        gr.Dropdown(choices=['en', 'he', 'it', 'fr', 'de', 'zh', 'ar'], label="Language")
    ],
    outputs="text",
    title="WhisperCap Transcription",
    description="Upload an audio file to transcribe it using Whisper.",
)
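
# Note: transcribe is a generator (it yields progress updates), which on older
# Gradio releases requires the queue to be enabled, e.g. iface.queue().launch().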
iface.launch()