"""Gradio app that turns a meeting recording into organized meeting minutes.

The audio is transcribed locally with a speech-recognition pipeline, and the
raw transcript is then sent to a hosted chat model that returns an organized,
sectioned version of the text.
"""

import os

import gradio as gr
import torch
from huggingface_hub import InferenceClient
from transformers import pipeline

# ----------------------
# AUDIO-TO-TEXT SETUP
# ----------------------
# Use CUDA device 0 when a GPU is available; otherwise run on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
AUDIO_MODEL_NAME = "distil-whisper/distil-large-v3"
BATCH_SIZE = 8

# Built once at import time so every request reuses the same loaded pipeline.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=AUDIO_MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


def transcribe(audio_input):
    """Convert audio to text using Whisper.

    Args:
        audio_input: The audio to transcribe. The Gradio interface below
            passes a file path (``gr.Audio(type="filepath")``), or ``None``
            when nothing was submitted.

    Returns:
        The ``"text"`` field of the pipeline output.

    Raises:
        gr.Error: If ``audio_input`` is ``None``.
    """
    if audio_input is None:
        raise gr.Error("No audio file submitted!")
    # Timestamps are requested from the pipeline, but only the text is returned.
    output = pipe(
        audio_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return output["text"]


# ----------------------
# TEXT ORGANIZATION SETUP
# ----------------------
TEXT_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Ensure HF_TOKEN is loaded as a Space secret
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found! Add it as a secret in your Space settings.")

# Force client to use the HF inference API
client = InferenceClient(token=hf_token, base_url="https://api-inference.huggingface.co")


def build_messages(meeting_transcript) -> list:
    """Build the chat messages that ask the model to organize a transcript.

    Args:
        meeting_transcript: Raw transcript text to embed in the user prompt.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the
        system instruction followed by the user request.
    """
    system_input = "You are an assistant that organizes meeting minutes."
    user_input = f"""
    Take this raw meeting transcript and return an organized, sectioned version.
    You may include a summary at the top.

    Transcript:
    {meeting_transcript}
    """
    return [
        {"role": "system", "content": system_input},
        {"role": "user", "content": user_input},
    ]


def organize_text(meeting_transcript, max_tokens=300):
    """Ask the chat model for an organized version of a raw transcript.

    Args:
        meeting_transcript: Raw transcript text to organize.
        max_tokens: Upper bound on generated tokens passed to the chat
            completion call. Defaults to 300, the value this app has always
            used; raise it if the organized output comes back cut off.

    Returns:
        The message content of the first completion choice.
    """
    messages = build_messages(meeting_transcript)
    # A fixed seed is passed with every request.
    response = client.chat_completion(
        messages,
        model=TEXT_MODEL_NAME,
        max_tokens=max_tokens,
        seed=42,
    )
    return response.choices[0].message.content


# ----------------------
# COMBINED TOOL
# ----------------------
def meeting_transcript_tool(audio_input):
    """Transcribe an audio file and return the organized transcript.

    Args:
        audio_input: Audio file path supplied by the Gradio audio component,
            or ``None`` when nothing was submitted.

    Returns:
        The organized transcript text produced by ``organize_text``.

    Raises:
        gr.Error: If no audio was submitted, or if the transcription comes
            back empty or whitespace-only.
    """
    meeting_text = transcribe(audio_input)
    # An empty transcript gives the chat model nothing to organize, so report
    # it to the user instead of sending a blank prompt.
    if not meeting_text or not meeting_text.strip():
        raise gr.Error("No speech was detected in the audio.")
    return organize_text(meeting_text)


# ----------------------
# GRADIO INTERFACE
# ----------------------
demo = gr.Interface(
    fn=meeting_transcript_tool,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(show_copy_button=True, label="Organized Transcript"),
    title="🪶 Meeting Transcription Tool",
    description="Upload or record an audio file. This app transcribes it using Whisper and organizes the text using Phi-3",
)

demo.launch()