|
from functools import lru_cache

import gradio as gr
import torch
import whisper
import whisperx
|
|
|
@lru_cache(maxsize=1)
def _asr_model(device):
    # Loading the "large" checkpoint is expensive (multi-GB weights);
    # cache it so it is loaded once per process instead of once per request.
    return whisper.load_model("large", device)


@lru_cache(maxsize=None)
def _alignment_model(language_code, device):
    # One WhisperX alignment model per detected language, cached across requests.
    return whisperx.load_align_model(language_code=language_code, device=device)


def transcribe(audio_file):
    """Transcribe an audio file with Whisper and word-align it with WhisperX.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the audio clip (Gradio supplies this because the
        input component uses ``type="filepath"``).

    Returns
    -------
    dict
        ``{"aligned": <segment-level alignments>, "word_segments":
        <word-level timing entries>}`` as produced by ``whisperx.align``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Whisper's result carries both the detected "language" and the "segments"
    # that WhisperX needs for forced alignment.
    result = _asr_model(device).transcribe(audio_file)

    model_a, metadata = _alignment_model(result["language"], device)
    result_aligned = whisperx.align(
        result["segments"], model_a, metadata, audio_file, device
    )

    return {
        "aligned": result_aligned["segments"],
        "word_segments": result_aligned["word_segments"],
    }
|
|
|
# Wire the UI: an uploaded audio file goes in, the aligned-transcription
# JSON produced by ``transcribe`` comes back out.
gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.JSON(),
).launch()
|
|