"""Gradio app that transcribes microphone or uploaded audio with OpenAI Whisper."""

import gradio as gr
import numpy as np
from transformers import (  # noqa: F401  (original imports are kept as-is)
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

# Load the model and its processor (feature extractor + tokenizer).
# Whisper is a speech-to-text model, so it is loaded through its own class
# rather than through the text-only AutoModelForSeq2SeqLM.
model_id = "openai/whisper-medium"
model = WhisperForConditionalGeneration.from_pretrained(model_id)
processor = WhisperProcessor.from_pretrained(model_id)
tokenizer = processor.tokenizer

# Whisper's feature extractor expects 16 kHz mono audio and pads/truncates
# every input to a single 30-second window.
_TARGET_RATE = 16_000
_CHUNK_SECONDS = 30


def _to_mono_16k(sample_rate, samples):
    """Return *samples* as a mono float32 array at 16 kHz.

    Integer PCM is scaled by the dtype's maximum value so it lands in
    roughly [-1, 1]; multi-channel input of shape (n, channels) is averaged
    down to one channel.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    if sample_rate != _TARGET_RATE and samples.size:
        # Plain linear interpolation: dependency-free, but it applies no
        # anti-aliasing filter when downsampling.
        n_out = max(1, int(round(samples.size * _TARGET_RATE / sample_rate)))
        src_times = np.arange(samples.size) / sample_rate
        dst_times = np.arange(n_out) / _TARGET_RATE
        samples = np.interp(dst_times, src_times, samples).astype(np.float32)
    return samples


def _transcribe_chunk(chunk):
    """Transcribe one chunk of at most 30 s of 16 kHz mono audio.

    Returns:
        A ``(text, log_probs)`` pair, where ``log_probs`` is a 1-D array
        holding the log-probability of each generated non-special token.
    """
    features = processor(
        chunk, sampling_rate=_TARGET_RATE, return_tensors="pt"
    ).input_features.to(model.device)
    generated = model.generate(
        features, output_scores=True, return_dict_in_generate=True
    )
    sequences = generated["sequences"]
    text = processor.batch_decode(sequences, skip_special_tokens=True)[0].strip()

    # Log-probability of every token the model actually emitted.
    log_probs = model.compute_transition_scores(
        sequences, generated["scores"], normalize_logits=True
    )[0]
    log_probs = log_probs.detach().cpu().numpy()

    # Scores cover only the newly generated tail of the sequence; align the
    # token ids with them, then drop special tokens (language/task markers,
    # end-of-text) so they do not skew the confidence.
    token_ids = sequences[0].cpu().numpy()
    token_ids = token_ids[token_ids.size - log_probs.size:]
    keep = np.isfinite(log_probs) & ~np.isin(token_ids, tokenizer.all_special_ids)
    return text, log_probs[keep]


# Define a function that takes an audio input and returns a transcription
def transcribe(audio, uploaded_audio=None):
    """Transcribe a recording and report a confidence score and its duration.

    The interface below wires two audio components to this function, so it
    accepts both; the microphone recording wins when both are provided.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the microphone
            component, or ``None`` when nothing was recorded.
        uploaded_audio: ``(sample_rate, samples)`` tuple from the upload
            component, or ``None`` when no file was uploaded.

    Returns:
        A ``(text, confidence, duration)`` tuple. ``confidence`` is the
        geometric mean of the generated text tokens' probabilities (0..1),
        and ``duration`` is the input length in seconds. Returns
        ``("", 0.0, 0.0)`` when no audio was supplied.
    """
    recording = audio if audio is not None else uploaded_audio
    if recording is None:
        return "", 0.0, 0.0

    sample_rate, samples = recording
    samples = np.asarray(samples)
    if samples.size == 0 or not sample_rate:
        return "", 0.0, 0.0
    duration = samples.shape[0] / sample_rate

    waveform = _to_mono_16k(sample_rate, samples)

    # The feature extractor truncates to 30 s, so longer recordings are fed
    # through in consecutive 30 s windows. NOTE: windows are cut at fixed
    # offsets, so a word straddling a boundary may be transcribed poorly.
    chunk_len = _TARGET_RATE * _CHUNK_SECONDS
    texts = []
    log_probs = []
    for start in range(0, waveform.size, chunk_len):
        chunk_text, chunk_log_probs = _transcribe_chunk(
            waveform[start:start + chunk_len]
        )
        if chunk_text:
            texts.append(chunk_text)
        log_probs.append(chunk_log_probs)

    all_log_probs = np.concatenate(log_probs) if log_probs else np.empty(0)
    confidence = float(np.exp(all_log_probs.mean())) if all_log_probs.size else 0.0

    # Return the text, confidence and duration as outputs
    return " ".join(texts), round(confidence, 3), round(duration, 2)


# Create a Gradio interface with two modes: realtime and file upload
# NOTE(review): gr.inputs / gr.outputs are Gradio's legacy component
# namespaces; they are kept so the app matches the Gradio version this file
# was written for -- confirm against the pinned Gradio version.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="numpy", label="Realtime Mode"),
        gr.inputs.Audio(source="upload", type="numpy", label="File Upload Mode"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Transcription"),
        gr.outputs.Textbox(label="Confidence Score"),
        gr.outputs.Textbox(label="Duration (seconds)"),
    ],
    title="Whisper Transcription App",
    description="A Gradio app that uses OpenAI's whisper model to transcribe audio",
)

# Launch the app
iface.launch()