import gradio as gr import torch from transformers import ( AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, ) import threading import time # ----------------------------------------------------------------------------- # 1. MODEL LOADING # ----------------------------------------------------------------------------- # In this advanced example, we'll instantiate the model directly (instead of using pipeline). # We'll do streaming outputs via TextIteratorStreamer. MODEL_NAME = "microsoft/phi-4" # Replace with an actual HF model if phi-4 is unavailable DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") try: tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto") except: # Fallback if model is not found or large. Here we default to a smaller model MODEL_NAME = "gpt2" tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE) model.eval() # ----------------------------------------------------------------------------- # 2. CONVERSATION / PROMPTS # ----------------------------------------------------------------------------- # We'll keep track of conversation using a list of dictionaries: # [ # {"role": "system", "content": "..."}, # {"role": "developer", "content": "..."}, # {"role": "user", "content": "User message"}, # {"role": "assistant", "content": "Assistant answer"}, # ... # ] # # We’ll also build in a mock retrieval system that merges knowledge snippets # into the final prompt if the user chooses to do so. DEFAULT_SYSTEM_PROMPT = ( "You are Philos, an advanced AI system created by ACC (Algorithmic Computer-generated Consciousness). " "Answer user queries accurately, thoroughly, and helpfully. Keep your responses relevant and correct." ) DEFAULT_DEVELOPER_PROMPT = ( "Ensure that you respond in a style that is professional, clear, and approachable. " "Include reasoning steps if needed, but keep them concise." ) # A small dictionary to emulate knowledge retrieval # In real scenarios, you might use an actual vector DB + retrieval method MOCK_KB = { "python": "Python is a high-level, interpreted programming language famous for its readability and flexibility.", "accelerate library": "The accelerate library by HF helps in distributed training and inference.", "phi-4 architecture": "phi-4 is a 14B-parameter, decoder-only Transformer with a 16K context window.", } def retrieve_knowledge(user_query): # Simple naive approach: check keywords in user query # Return a knowledge snippet if found matches = [] for keyword, snippet in MOCK_KB.items(): if keyword.lower() in user_query.lower(): matches.append(snippet) return matches # ----------------------------------------------------------------------------- # 3. HELPER: Build the prompt from conversation # ----------------------------------------------------------------------------- def build_prompt(conversation): """ Convert conversation (list of role/content dicts) into a single text prompt that the model can process. We adopt a simple format: system, developer, user, assistant, ... """ prompt = "" for msg in conversation: if msg["role"] == "system": prompt += f"[System]\n{msg['content']}\n" elif msg["role"] == "developer": prompt += f"[Developer]\n{msg['content']}\n" elif msg["role"] == "user": prompt += f"[User]\n{msg['content']}\n" else: # assistant prompt += f"[Assistant]\n{msg['content']}\n" prompt += "[Assistant]\n" # We end with an assistant role so model can continue return prompt # ----------------------------------------------------------------------------- # 4. STREAMING GENERATION # ----------------------------------------------------------------------------- def generate_tokens_stream(prompt, temperature=0.7, top_p=0.9, max_new_tokens=128): """ Uses TextIteratorStreamer to yield tokens one by one (or in small chunks). """ streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True) input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE) generation_kwargs = dict( input_ids=input_ids, streamer=streamer, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=True, pad_token_id=tokenizer.eos_token_id, ) # We'll run generation in a background thread, streaming tokens thread = threading.Thread(target=model.generate, kwargs=generation_kwargs) thread.start() # Stream tokens partial_text = "" for new_token in streamer: partial_text += new_token yield partial_text thread.join() # ----------------------------------------------------------------------------- # 5. MAIN CHAT FUNCTION # ----------------------------------------------------------------------------- def advanced_chat(user_msg, conversation, system_prompt, dev_prompt, retrieve_flg, temperature, top_p): """ - Update conversation with the user's message - Optionally retrieve knowledge and incorporate into the system or developer prompt - Build the final prompt - Stream the assistant's reply """ # If user message is empty if not user_msg.strip(): yield "Please enter a message." return # 1) Construct or update system/dev prompts system_message = {"role": "system", "content": system_prompt} developer_message = {"role": "developer", "content": dev_prompt} # 2) Insert or replace system/dev in the conversation # We'll assume the first system/dev messages are at the start of conversation # or add them if not present filtered = [msg for msg in conversation if msg["role"] not in ["system", "developer"]] conversation = [system_message, developer_message] + filtered # 3) Append user's message conversation.append({"role": "user", "content": user_msg}) # 4) Retrieve knowledge if user toggled "Include knowledge retrieval" if retrieve_flg: knowledge_snippets = retrieve_knowledge(user_msg) if knowledge_snippets: # We can just append them to developer or system content for simplicity knowledge_text = "\n".join(["[Knowledge] " + s for s in knowledge_snippets]) conversation[1]["content"] += f"\n\n[Additional Knowledge]\n{knowledge_text}" # 5) Build final prompt prompt = build_prompt(conversation) # 6) Stream the assistant’s response partial_response = "" for partial_text in generate_tokens_stream(prompt, temperature, top_p): partial_response = partial_text yield partial_text # Send partial tokens to Gradio for display # 7) Now that generation is complete, append final assistant message conversation.append({"role": "assistant", "content": partial_response}) # ----------------------------------------------------------------------------- # 6. BUILD GRADIO INTERFACE # ----------------------------------------------------------------------------- def build_ui(): with gr.Blocks(title="PhilosBeta-Advanced", css="#chatbot{height:550px} .overflow-y-auto{max-height:550px}") as demo: gr.Markdown("# **PhilosBeta: Advanced Demo**") gr.Markdown( "An example of multi-turn conversation with streaming responses, " "optional retrieval, and custom system/developer prompts." ) # State to store the conversation as a list of role/content dicts conversation_state = gr.State([]) # TEXT ELEMENTS with gr.Row(): with gr.Column(): system_prompt_box = gr.Textbox( label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3 ) developer_prompt_box = gr.Textbox( label="Developer Prompt", value=DEFAULT_DEVELOPER_PROMPT, lines=3 ) with gr.Column(): retrieve_flag = gr.Checkbox(label="Include Knowledge Retrieval", value=False) temperature_slider = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature") top_p_slider = gr.Slider(0.0, 1.0, 0.9, step=0.05, label="Top-p") max_tokens_info = gr.Markdown("Max new tokens = 128 (fixed in code).") # MAIN CHAT UI chatbox = gr.Chatbot(label="Philos Conversation", elem_id="chatbot").style(height=500) user_input = gr.Textbox( label="Your Message", placeholder="Type here...", lines=3 ) send_btn = gr.Button("Send", variant="primary") # --------------------------------------------------------------------- # ACTION: Handle user input # --------------------------------------------------------------------- def user_send( user_text, conversation, sys_prompt, dev_prompt, retrieve_flg, temperature, top_p ): """ This function calls advanced_chat() and streams tokens back to update the Chatbot UI. """ # We'll create a generator to update the Chatbot in real-time message_stream = advanced_chat( user_msg=user_text, conversation=conversation, system_prompt=sys_prompt, dev_prompt=dev_prompt, retrieve_flg=retrieve_flg, temperature=temperature, top_p=top_p ) return message_stream, conversation # Gradio can handle generator outputs for streaming. # We map the streamed tokens to the Chatbot component in real-time. chatbox_stream = gr.Chatbot.update() send_btn.click( fn=user_send, inputs=[ user_input, conversation_state, system_prompt_box, developer_prompt_box, retrieve_flag, temperature_slider, top_p_slider, ], outputs=[chatbox_stream, conversation_state], ) # We also let the user press Enter to send messages user_input.submit( fn=user_send, inputs=[ user_input, conversation_state, system_prompt_box, developer_prompt_box, retrieve_flag, temperature_slider, top_p_slider, ], outputs=[chatbox_stream, conversation_state], ) return demo # ----------------------------------------------------------------------------- # 7. LAUNCH # ----------------------------------------------------------------------------- if __name__ == "__main__": ui = build_ui() ui.launch()