import numpy as np
import gradio as gr
from dotenv import load_dotenv
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    Stream,
    WebRTCError,
    get_current_context,
    get_hf_turn_credentials,
    get_hf_turn_credentials_async,
    get_stt_model,
    get_tts_model,
)
from huggingface_hub import InferenceClient

load_dotenv()

# Local speech-to-text and text-to-speech models bundled with FastRTC.
stt_model = get_stt_model()
tts_model = get_tts_model()

# Per-connection chat histories, keyed by WebRTC connection id.
conversations: dict[str, list[dict[str, str]]] = {}


def response(
    audio: tuple[int, np.ndarray],
    hf_token: str | None,
):
    if hf_token is None:
        raise WebRTCError("HF Token is required")

    # The token comes from the UI on each request, so the client is built here
    # rather than once at module load.
    llm_client = InferenceClient(provider="groq", token=hf_token)

    # Look up (or start) the conversation for this WebRTC connection.
    context = get_current_context()
    if context.webrtc_id not in conversations:
        conversations[context.webrtc_id] = [
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant that can have engaging conversations. "
                    "Your responses must be very short and concise. "
                    "No more than two sentences."
                ),
            }
        ]
    messages = conversations[context.webrtc_id]

    # Transcribe the user's audio and add it to the history.
    transcription = stt_model.stt(audio)
    messages.append({"role": "user", "content": transcription})

    # Stream the LLM completion and accumulate the full reply.
    output = llm_client.chat.completions.create(  # type: ignore
        model="openai/gpt-oss-20b",
        messages=messages,  # type: ignore
        max_tokens=1024,
        stream=True,
    )
    output_text = ""
    for chunk in output:
        output_text += chunk.choices[0].delta.content or ""

    messages.append({"role": "assistant", "content": output_text})
    conversations[context.webrtc_id] = messages

    # Speak the reply, then emit the updated history to the chatbot component.
    yield from tts_model.stream_tts_sync(output_text)
    yield AdditionalOutputs(messages)


chatbot = gr.Chatbot(label="Chatbot", type="messages")
token = gr.Textbox(
    label="HF Token",
    value="",
    type="password",
)
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response),
    # TURN credentials: a static set for the server, an async getter for clients.
    server_rtc_configuration=get_hf_turn_credentials(),
    rtc_configuration=get_hf_turn_credentials_async,
    additional_inputs=[token],
    additional_outputs=[chatbot],
    # Replace the previous chat history with the one yielded by the handler.
    additional_outputs_handler=lambda old, new: new,
    ui_args={"title": "Talk To OpenAI GPT-OSS 20B (Powered by FastRTC ⚡️)"},
)

stream.ui.launch()
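
# For deployment without the built-in Gradio UI, here is a minimal sketch of
# mounting the same stream on a FastAPI app (it assumes a standard
# FastAPI/uvicorn setup; the host and port choices are arbitrary). Replace the
# stream.ui.launch() call above with:
#
#     from fastapi import FastAPI
#     import uvicorn
#
#     app = FastAPI()
#     stream.mount(app)  # registers the stream's WebRTC routes on the app
#     uvicorn.run(app, host="0.0.0.0", port=7860)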