"""Gradio app that transcribes audio with OpenAI's Whisper model.

Accepts audio from the microphone or a file upload and returns the
transcribed text, a rough confidence score, and the clip duration.
"""

import gradio as gr
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load the model and processor once at startup.
model_id = "openai/whisper-medium"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id)
model.config.forced_decoder_ids = None

# Whisper's feature extractor expects 16 kHz mono float audio.
TARGET_SAMPLE_RATE = 16000


def _prepare_audio(sample_rate, data):
    """Convert Gradio's (rate, array) audio payload to 16 kHz mono float32.

    Gradio's ``type="numpy"`` audio components deliver a
    ``(sample_rate, ndarray)`` tuple, typically int16 PCM and possibly
    multi-channel. Returns ``(samples, duration_seconds)``.
    """
    data = np.asarray(data, dtype=np.float32)
    # Collapse stereo/multi-channel to mono.
    if data.ndim > 1:
        data = data.mean(axis=1)
    # int16 PCM peaks far above 1.0; normalize into [-1, 1].
    peak = float(np.abs(data).max()) if data.size else 0.0
    if peak > 1.0:
        data = data / 32768.0
    duration = data.size / float(sample_rate)
    if sample_rate != TARGET_SAMPLE_RATE and data.size:
        # Linear-interpolation resample: avoids an extra dependency and is
        # adequate quality for speech-recognition input.
        n_out = int(round(data.size * TARGET_SAMPLE_RATE / sample_rate))
        x_new = np.linspace(0.0, data.size - 1, n_out)
        data = np.interp(x_new, np.arange(data.size), data).astype(np.float32)
    return data, duration


def transcribelocal(microphone, file_upload):
    """Transcribe whichever audio input was provided.

    Prefers the microphone input when both are present (same priority as
    the original implementation).

    Returns:
        (text, confidence, duration) as three strings suitable for the
        Textbox outputs. Confidence is the exponentiated mean
        log-probability of the generated tokens, in [0, 1].
    """
    audio = microphone if microphone is not None else file_upload
    if audio is None:
        # Neither input was supplied; report cleanly instead of crashing.
        return "No audio provided.", "", ""
    sample_rate, raw = audio
    samples, duration = _prepare_audio(sample_rate, raw)

    # WhisperProcessor has no .transcribe(); the supported path is
    # feature extraction -> generate -> batch_decode.
    inputs = processor(samples, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")
    with torch.no_grad():
        generated = model.generate(
            inputs.input_features,
            return_dict_in_generate=True,
            output_scores=True,
        )

    # skip_special_tokens drops <|startoftranscript|>/<|endoftranscript|>
    # (and all other control tokens) for us.
    text = processor.batch_decode(generated.sequences, skip_special_tokens=True)[0].strip()

    # Rough confidence: mean per-token log-probability mapped back to [0, 1].
    transition_scores = model.compute_transition_scores(
        generated.sequences, generated.scores, normalize_logits=True
    )
    confidence = float(torch.exp(transition_scores.mean()))

    return text, f"{confidence:.3f}", f"{duration:.2f}"


# Gradio interface with two input modes: realtime microphone and file upload.
# gr.inputs/gr.outputs were removed in Gradio 3/4; components live at the
# top level and Audio takes sources=[...] instead of source=.
iface = gr.Interface(
    fn=transcribelocal,
    inputs=[
        gr.Audio(sources=["microphone"], type="numpy", label="Realtime Mode"),
        gr.Audio(sources=["upload"], type="numpy", label="File Upload Mode"),
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Confidence Score"),
        gr.Textbox(label="Duration (seconds)"),
    ],
    title="Whisper Transcription App",
    description="A Gradio app that uses OpenAI's whisper model to transcribe audio",
)

# Guard the launch so importing this module (e.g. for testing) has no side effect.
if __name__ == "__main__":
    iface.launch()