whisperx-test / app.py
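"""Gradio app for CPU-only WhisperX transcription with word-level timestamps.

Pre-loads the tiny/base/small Whisper models at startup, transcribes uploaded
or recorded audio, then force-aligns the output to produce per-word timings.
"""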
import gradio as gr
import whisperx
import torch
import librosa
import logging
import os
import time
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("whisperx_app")
# Device setup (force CPU)
device = "cpu"
torch.set_num_threads(os.cpu_count())  # let PyTorch (alignment model) use all cores
# Pre-load models
models = {
    # int8 compute type: CTranslate2 (the faster-whisper backend) does not
    # support float16 on CPU, so the default compute type would fail here
    "tiny": whisperx.load_model("tiny", device, compute_type="int8"),
    "base": whisperx.load_model("base", device, compute_type="int8"),
    "small": whisperx.load_model("small", device, compute_type="int8"),
}
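# Pipeline: transcribe with Whisper, then force-align the segments with a
# language-specific wav2vec2 model to recover per-word start/end times.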
def transcribe(audio_file, model_size="base", debug=False):
    start_time = time.time()
    result = ""
    debug_log = []
    try:
        # Load audio, resampled to the 16 kHz mono input Whisper expects
        audio, sr = librosa.load(audio_file, sr=16000)

        # Run inference
        model = models[model_size]
        batch_size = 8 if model_size == "tiny" else 4
        transcript = model.transcribe(audio, batch_size=batch_size)

        # Align whisper output to get word-level timestamps
        model_a, metadata = whisperx.load_align_model(
            language_code=transcript["language"], device=device
        )
        transcript_aligned = whisperx.align(
            transcript["segments"], model_a, metadata, audio, device
        )

        # Format word-level output
        for segment in transcript_aligned["segments"]:
            for word in segment["words"]:
                # Alignment can fail to time some tokens; skip any word
                # that comes back without start/end timestamps
                if "start" not in word or "end" not in word:
                    continue
                result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"

        debug_log.append(f"Processed in {time.time() - start_time:.2f}s")
        debug_log.append(f"Language detected: {transcript['language']}")
        debug_log.append(f"Batch size: {batch_size}")
    except Exception as e:
        logger.error("Error during transcription:", exc_info=True)
        result = "Error occurred during transcription"
        debug_log.append(f"ERROR: {str(e)}")

    # The click handler below is wired to two outputs, so always return two
    # values; returning a bare string when debug is off would break Gradio.
    if debug:
        return result, "\n".join(debug_log)
    return result, ""
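# Note: load_align_model runs on every request because the alignment model
# depends on the detected language; caching it per language would avoid the
# repeated load cost for same-language audio.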
# Gradio Interface
with gr.Blocks(title="WhisperX CPU Transcription") as demo:
    gr.Markdown("# WhisperX CPU Transcription with Word-Level Timestamps")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath",
                sources=["upload", "microphone"],
                interactive=True,
            )
            model_selector = gr.Dropdown(
                choices=["tiny", "base", "small"],
                value="base",
                label="Model Size",
                interactive=True,
            )
            debug_checkbox = gr.Checkbox(label="Enable Debug Mode", value=False)
            transcribe_btn = gr.Button("Transcribe", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription Output",
                lines=20,
                placeholder="Transcription will appear here...",
            )
            debug_output = gr.Textbox(
                label="Debug Information",
                lines=10,
                placeholder="Debug logs will appear here...",
                visible=False,
            )

    # Toggle debug visibility
    def toggle_debug(debug_enabled):
        return gr.update(visible=debug_enabled)

    debug_checkbox.change(
        toggle_debug,
        inputs=[debug_checkbox],
        outputs=[debug_output],
    )

    # Process transcription
    transcribe_btn.click(
        transcribe,
        inputs=[audio_input, model_selector, debug_checkbox],
        outputs=[output_text, debug_output],
    )
# Launch configuration
if __name__ == "__main__":
    # Cap queued requests so CPU inference isn't oversubscribed
    demo.queue(max_size=4).launch()
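# launch() serves on http://127.0.0.1:7860 by default; pass
# launch(server_name="0.0.0.0") to expose the app outside a container.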