import gradio as gr
import torch
import time
import threading
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# === Load the local model (CPU) ===
model_name = "lambdaindie/lambda-1v-1B"  # swap in your own model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")  # keep everything on CPU

# === Global flag used to interrupt streaming ===
stop_signal = {"stop": False}


def generate_stream(prompt, max_tokens=512, temperature=0.7, top_p=0.95):
    """Run model.generate in a background thread and yield the growing output as it streams."""
    stop_signal["stop"] = False
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_thread = threading.Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            streamer=streamer,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
        ),
    )
    generation_thread.start()

    output = ""
    for token in streamer:
        if stop_signal["stop"]:
            # Stop updating the UI; the generate() call finishes in its own thread.
            break
        output += token
        yield output.strip()


def stop_stream():
    stop_signal["stop"] = True


def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}] if system_message else []

    for user, assistant in history[-3:]:  # keep only the last 3 turns to bound the prompt
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})

    thinking_prompt = messages + [
        {"role": "user", "content": f"{message}\n\nThink step-by-step before answering."}
    ]
    thinking_text = "\n".join(f"{m['role']}: {m['content']}" for m in thinking_prompt)

    # Phase 1: stream the model's reasoning into the styled "thinking" box.
    reasoning = ""
    yield '<div class="markdown-think">Thinking...</div>'
    start = time.time()
    for token in generate_stream(thinking_text, max_tokens, temperature, top_p):
        reasoning = token
        yield f'<div class="markdown-think">{reasoning.strip()}</div>'
    elapsed = time.time() - start
    yield f"""<div class="markdown-think">Thought for {elapsed:.1f} seconds</div>"""
""" final_prompt = thinking_text + f"\n\nuser: {message}\nassistant: {reasoning.strip()}\nuser: Now answer based on your reasoning above.\nassistant:" final_answer = "" for token in generate_stream(final_prompt, max_tokens, temperature, top_p): final_answer = token yield final_answer.strip() # === Interface === css = """ @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono&display=swap'); * { font-family: 'JetBrains Mono', monospace !important; } html, body, .gradio-container { background-color: #111 !important; color: #e0e0e0 !important; } textarea, input, button, select { background-color: transparent !important; color: #e0e0e0 !important; border: 1px solid #444 !important; } .markdown-think { background-color: #1e1e1e; border-left: 4px solid #555; padding: 10px; margin-bottom: 8px; font-style: italic; white-space: pre-wrap; animation: pulse 1.5s infinite ease-in-out; } @keyframes pulse { 0% { opacity: 0.6; } 50% { opacity: 1.0; } 100% { opacity: 0.6; } } """ theme = gr.themes.Base( primary_hue="gray", font=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"] ).set( body_background_fill="#111", body_text_color="#e0e0e0", input_background_fill="#222", input_border_color="#444", button_primary_background_fill="#333", button_primary_text_color="#e0e0e0", ) chatbot = gr.ChatInterface( fn=respond, title="λambdAI", css=css, theme=theme, additional_inputs=[ gr.Textbox(value="", label="System Message"), gr.Slider(64, 2048, value=512, step=1, label="Max Tokens"), gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"), gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p") ] ) stop_btn = gr.Button("Parar Geração") stop_btn.click(fn=stop_stream, inputs=[], outputs=[]) app = gr.Blocks() with app: chatbot.render() stop_btn.render() if __name__ == "__main__": app.launch(share=True)