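# WhisperCap: a Gradio app that transcribes uploaded audio with Whisper large-v3
# and optionally produces SRT-style subtitles.
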
import os
import math
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from moviepy.editor import AudioFileClip


def transcribe(audio_file, transcribe_to_text, transcribe_to_srt, language):
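    # Prefer GPU with fp16 when available; otherwise fall back to CPU with fp32.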
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
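    # Load the model and processor on every call. Hoisting these to module
    # scope would avoid re-reading the weights for each transcription.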
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
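    # Chunked ASR pipeline: 30 s windows with segment timestamps for SRT output.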
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=2,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": language},
    )
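    # gr.Audio(type="filepath") hands us a plain path string. Split the input
    # into 30-second pieces so progress can be reported per chunk.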
    audio = AudioFileClip(audio_file)
    duration = audio.duration
    n_chunks = math.ceil(duration / 30)
    transcription_txt = ""
    transcription_srt = []
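    # Transcribe chunk by chunk, offsetting timestamps by each chunk's start time.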
    for i in range(n_chunks):
        start = i * 30
        end = min((i + 1) * 30, duration)
        audio_chunk = audio.subclip(start, end)
        temp_file_path = f"temp_audio_{i}.wav"
        audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')
        result = pipe(temp_file_path)  # the pipeline accepts a filename directly
        transcription_txt += result["text"]
        if transcribe_to_srt:
            for chunk in result["chunks"]:
                start_time, end_time = chunk["timestamp"]
                if end_time is None:  # the pipeline may leave the final segment open-ended
                    end_time = end - start
                transcription_srt.append({
                    "start": start_time + i * 30,
                    "end": end_time + i * 30,
                    "text": chunk["text"]
                })
        os.remove(temp_file_path)
        yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%"
output = "" | |
if transcribe_to_text: | |
output += "Text Transcription:\n" + transcription_txt + "\n\n" | |
if transcribe_to_srt: | |
output += "SRT Transcription:\n" | |
for i, sub in enumerate(transcription_srt, 1): | |
output += f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n" | |
yield output | |
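

# Format a float number of seconds as an SRT timestamp (HH:MM:SS,mmm).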
def format_time(seconds):
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
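

# Gradio UI: audio upload, output-format checkboxes, and a language dropdown.
# Because transcribe is a generator, the text output streams progress updates.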
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Checkbox(label="Transcribe to Text"),
        gr.Checkbox(label="Transcribe to SRT"),
        gr.Dropdown(choices=['en', 'he', 'it', 'fr', 'de', 'zh', 'ar'], label="Language")
    ],
    outputs="text",
    title="WhisperCap Transcription",
    description="Upload an audio file to transcribe it using Whisper.",
)

iface.launch()