import gradio as gr
import whisperx
import torch
import librosa
import logging
import os
import time

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("whisperx_app")

# Device setup (force CPU)
device = "cpu"
torch.set_num_threads(os.cpu_count())  # use every available core for inference

# Pre-load models. compute_type="int8" is added here because CTranslate2
# cannot run the default float16 compute type on CPU; int8 keeps memory
# and latency reasonable for CPU-only inference.
models = {
    "tiny": whisperx.load_model("tiny", device, compute_type="int8"),
    "base": whisperx.load_model("base", device, compute_type="int8"),
    "small": whisperx.load_model("small", device, compute_type="int8"),
}
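
# Possible optimization, sketched here as a comment and not part of the
# original app: cache alignment models per language so repeated requests
# skip the reload that transcribe() performs on every call.
# align_models = {}  # language_code -> (model, metadata)
# def get_align_model(lang):
#     if lang not in align_models:
#         align_models[lang] = whisperx.load_align_model(language_code=lang, device=device)
#     return align_models[lang]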

def transcribe(audio_file, model_size="base", debug=False):
    start_time = time.time()
    result = ""
    debug_log = []
    try:
        # Load audio resampled to the 16 kHz rate Whisper expects
        audio, sr = librosa.load(audio_file, sr=16000)

        # Run inference
        model = models[model_size]
        batch_size = 8 if model_size == "tiny" else 4
        transcript = model.transcribe(audio, batch_size=batch_size)

        # Align whisper output to get word-level timestamps
        model_a, metadata = whisperx.load_align_model(
            language_code=transcript["language"], device=device
        )
        transcript_aligned = whisperx.align(
            transcript["segments"], model_a, metadata, audio, device
        )

        # Format word-level output
        for segment in transcript_aligned["segments"]:
            for word in segment["words"]:
                # Some words (e.g. numerals) can come back without timestamps
                if "start" in word and "end" in word:
                    result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
                else:
                    result += f"[no timestamp] {word['word']}\n"

        debug_log.append(f"Processed in {time.time()-start_time:.2f}s")
        debug_log.append(f"Language detected: {transcript['language']}")
        debug_log.append(f"Batch size: {batch_size}")
    except Exception as e:
        logger.error("Error during transcription", exc_info=True)
        result = "Error occurred during transcription"
        debug_log.append(f"ERROR: {str(e)}")

    # Always return two values: the click handler below is wired to two outputs,
    # so returning a bare string when debug is off would break the UI.
    return result, ("\n".join(debug_log) if debug else "")
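
# Quick sanity check outside the UI (a sketch; "sample.wav" is a hypothetical
# local file, not part of this repo):
# text, log = transcribe("sample.wav", model_size="tiny", debug=True)
# print(log)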

# Gradio Interface
with gr.Blocks(title="WhisperX CPU Transcription") as demo:
    gr.Markdown("# WhisperX CPU Transcription with Word-Level Timestamps")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath",
                sources=["upload", "microphone"],
                interactive=True,
            )
            model_selector = gr.Dropdown(
                choices=["tiny", "base", "small"],
                value="base",
                label="Model Size",
                interactive=True,
            )
            debug_checkbox = gr.Checkbox(label="Enable Debug Mode", value=False)
            transcribe_btn = gr.Button("Transcribe", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription Output",
                lines=20,
                placeholder="Transcription will appear here...",
            )
            debug_output = gr.Textbox(
                label="Debug Information",
                lines=10,
                placeholder="Debug logs will appear here...",
                visible=False,
            )

    # Toggle debug visibility
    def toggle_debug(debug_enabled):
        return gr.update(visible=debug_enabled)

    debug_checkbox.change(
        toggle_debug,
        inputs=[debug_checkbox],
        outputs=[debug_output],
    )

    # Process transcription
    transcribe_btn.click(
        transcribe,
        inputs=[audio_input, model_selector, debug_checkbox],
        outputs=[output_text, debug_output],
    )

# Launch configuration
if __name__ == "__main__":
    demo.queue(max_size=4).launch()
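
# On Hugging Face Spaces the default launch() arguments are sufficient.
# For a LAN-visible local run, standard Gradio options could be passed
# instead (a sketch, not required by the app):
# demo.queue(max_size=4).launch(server_name="0.0.0.0", server_port=7860)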