import time

import gradio as gr
from huggingface_hub import InferenceClient

client = InferenceClient("lambdaindie/lambdai")


def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Build the base message history (assumes Gradio's legacy
    # [user, assistant] pair format for `history`).
    messages = [{"role": "system", "content": system_message}] if system_message else []
    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})

    # Phase 1 — thinking aloud (reasoning step): ask the model to reason
    # step by step before committing to an answer.
    thinking_prompt = messages + [
        {
            "role": "user",
            "content": f"{message}\n\nThink step-by-step before answering.",
        }
    ]

    reasoning = ""
    yield "**Thinking...**\n```markdown\n```"  # Empty gray markdown block as a placeholder

    for chunk in client.chat_completion(
        thinking_prompt,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content or ""
        reasoning += token
        # Re-yield the whole block each time so the fence stays closed.
        yield f"**Thinking...**\n```markdown\n{reasoning.strip()}\n```"

    time.sleep(0.5)  # Optional dramatic pause before the answer

    # Phase 2 — final answer: feed the reasoning back in as assistant
    # context and ask for the answer proper.
    final_prompt = messages + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": reasoning.strip()},
        {"role": "user", "content": "Now answer based on your reasoning above."},
    ]

    final_answer = ""
    for chunk in client.chat_completion(
        final_prompt,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content or ""
        final_answer += token
        yield final_answer.strip()  # Streaming this replaces the thinking block in the UI


demo = gr.ChatInterface(
    respond,
    title="LENIRΛ",
    theme=gr.themes.Base(primary_hue="gray", font=["JetBrains Mono", "monospace"]),
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are a concise, logical AI that explains its reasoning "
                "clearly before answering."
            ),
            label="System Message",
        ),
        gr.Slider(64, 2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()
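
# --- Optional smoke test of the two-phase generator (a minimal sketch, not
# part of the app itself; it assumes the "lambdaindie/lambdai" model is
# reachable through the Hugging Face Inference API, e.g. with an HF token
# configured in your environment). Uncomment to stream both phases to stdout:
#
# if __name__ == "__main__":
#     for partial in respond(
#         "What is 2 + 2?",          # user message
#         [],                        # empty chat history
#         "You are concise.",        # system message
#         128,                       # max_tokens
#         0.7,                       # temperature
#         0.95,                      # top_p
#     ):
#         print(partial)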