import gradio as gr
import whisperx
import torch
import librosa
import logging
import os
import time

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("whisperx_app")

# Device setup (force CPU); int8 keeps memory usage manageable without a GPU
device = "cpu"
compute_type = "int8"
torch.set_num_threads(os.cpu_count())

# Pre-load models so the first request does not pay the load cost.
# Note: holding all six models in memory at once is RAM-intensive on CPU.
MODEL_SIZES = ["tiny", "base", "small", "large", "large-v2", "large-v3"]
models = {
    size: whisperx.load_model(size, device, compute_type=compute_type, vad_method="silero")
    for size in MODEL_SIZES
}


def transcribe(audio_file, model_size="base", debug=False):
    start_time = time.time()
    result = ""
    debug_log = []

    try:
        # Load audio at 16 kHz mono, as expected by Whisper models
        audio, _ = librosa.load(audio_file, sr=16000)

        # Run inference; use a smaller batch for larger models to limit memory use
        model = models[model_size]
        batch_size = 8 if model_size == "tiny" else 4
        transcript = model.transcribe(audio, batch_size=batch_size)

        # Align Whisper output to obtain word-level timestamps
        model_a, metadata = whisperx.load_align_model(
            language_code=transcript["language"], device=device
        )
        transcript_aligned = whisperx.align(
            transcript["segments"], model_a, metadata, audio, device
        )

        # Format word-level output; skip words the aligner could not time
        # (e.g. numerals), which lack "start"/"end" keys
        for segment in transcript_aligned["segments"]:
            for word in segment["words"]:
                if "start" not in word or "end" not in word:
                    continue
                result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"

        debug_log.append(f"Processed in {time.time() - start_time:.2f}s")
        debug_log.append(f"Language detected: {transcript['language']}")
        debug_log.append(f"Batch size: {batch_size}")

    except Exception as e:
        logger.error("Error during transcription", exc_info=True)
        result = "Error occurred during transcription"
        debug_log.append(f"ERROR: {e}")

    # Always return both values: the click handler is wired to two outputs,
    # so returning a bare string when debug is off would break Gradio
    debug_text = "\n".join(debug_log) if debug else ""
    return result, debug_text


# Gradio interface
with gr.Blocks(title="WhisperX CPU Transcription") as demo:
    gr.Markdown("# WhisperX CPU Transcription with Word-Level Timestamps")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath",
                sources=["upload", "microphone"],
                interactive=True,
            )
            model_selector = gr.Dropdown(
                choices=list(models.keys()),
                value="base",
                label="Model Size",
                interactive=True,
            )
            debug_checkbox = gr.Checkbox(label="Enable Debug Mode", value=False)
            transcribe_btn = gr.Button("Transcribe", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription Output",
                lines=20,
                placeholder="Transcription will appear here...",
            )
            debug_output = gr.Textbox(
                label="Debug Information",
                lines=10,
                placeholder="Debug logs will appear here...",
                visible=False,
            )

    # Show the debug textbox only while debug mode is enabled
    def toggle_debug(debug_enabled):
        return gr.update(visible=debug_enabled)

    debug_checkbox.change(
        toggle_debug, inputs=[debug_checkbox], outputs=[debug_output]
    )

    # Process transcription
    transcribe_btn.click(
        transcribe,
        inputs=[audio_input, model_selector, debug_checkbox],
        outputs=[output_text, debug_output],
    )

# Launch configuration: the queue caps concurrent CPU-bound jobs
if __name__ == "__main__":
    demo.queue(max_size=4).launch()
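
# Usage sketch (assumptions, not part of the original script): with whisperx,
# gradio, and librosa installed (e.g. `pip install whisperx gradio librosa`),
# save this file as e.g. `app.py` and run `python app.py`, then open the local
# URL Gradio prints. Pre-loading all six models is RAM-heavy on CPU, so
# trimming MODEL_SIZES to the sizes you actually need is a reasonable tweak
# for smaller machines.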