import gradio as gr
import torch
import time
import threading
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
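
# Gradio chat app: streams a step-by-step "thinking" pass from a local
# CPU-only causal LM, then streams a final answer conditioned on that reasoning.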

# === Load the local model (CPU) ===
model_name = "lambdaindie/lambda-1v-1B"  # swap in your own model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")  # keep the model on CPU

# === Global flag used to interrupt streaming ===
stop_signal = {"stop": False}

def generate_stream(prompt, max_tokens=512, temperature=0.7, top_p=0.95):
    """Stream generated text for `prompt`, yielding the accumulated output so far."""
    stop_signal["stop"] = False
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
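    # Run generate() in a worker thread so tokens can be streamed as they arrive.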

    generation_thread = threading.Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            streamer=streamer,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
        )
    )
    generation_thread.start()
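    # generate() is now producing tokens in the background; consume them here.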

    output = ""
    for token in streamer:
        if stop_signal["stop"]:
            break
        output += token
        yield output.strip()

def stop_stream():
    """Ask the streaming loop to stop (the background generate() call still runs to completion)."""
    stop_signal["stop"] = True

def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Two-pass reply: stream a step-by-step reasoning trace, then stream the final answer."""
    messages = [{"role": "system", "content": system_message}] if system_message else []

    for user, assistant in history[-3:]:  # keep only the last 3 exchanges
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})

    thinking_prompt = messages + [{"role": "user", "content": f"{message}\n\nThink step-by-step before answering."}]
    thinking_text = "\n".join([f"{m['role']}: {m['content']}" for m in thinking_prompt])

    reasoning = ""
    yield '<div class="markdown-think">Thinking...</div>'

    start = time.time()
    for token in generate_stream(thinking_text, max_tokens, temperature, top_p):
        reasoning = token
        yield f'<div class="markdown-think">{reasoning.strip()}</div>'

    elapsed = time.time() - start
    yield f"""
    <div style="margin-top:12px;padding:8px 12px;background-color:#222;border-left:4px solid #888;
            font-family:'JetBrains Mono', monospace;color:#ccc;font-size:14px;">
        Thought for {elapsed:.1f} seconds
    </div>
    """

    final_prompt = thinking_text + f"\n\nuser: {message}\nassistant: {reasoning.strip()}\nuser: Now answer based on your reasoning above.\nassistant:"
    final_answer = ""

    for token in generate_stream(final_prompt, max_tokens, temperature, top_p):
        final_answer = token
        yield final_answer.strip()

# === Interface ===

css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono&display=swap');
* { font-family: 'JetBrains Mono', monospace !important; }
html, body, .gradio-container {
    background-color: #111 !important;
    color: #e0e0e0 !important;
}
textarea, input, button, select {
    background-color: transparent !important;
    color: #e0e0e0 !important;
    border: 1px solid #444 !important;
}
.markdown-think {
    background-color: #1e1e1e;
    border-left: 4px solid #555;
    padding: 10px;
    margin-bottom: 8px;
    font-style: italic;
    white-space: pre-wrap;
    animation: pulse 1.5s infinite ease-in-out;
}
@keyframes pulse {
    0% { opacity: 0.6; }
    50% { opacity: 1.0; }
    100% { opacity: 0.6; }
}
"""

theme = gr.themes.Base(
    primary_hue="gray",
    font=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"]
).set(
    body_background_fill="#111",
    body_text_color="#e0e0e0",
    input_background_fill="#222",
    input_border_color="#444",
    button_primary_background_fill="#333",
    button_primary_text_color="#e0e0e0",
)

chatbot = gr.ChatInterface(
    fn=respond,
    title="λambdAI",
    css=css,
    theme=theme,
    additional_inputs=[
        gr.Textbox(value="", label="System Message"),
        gr.Slider(64, 2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
    ]
)

app = gr.Blocks()
with app:
    chatbot.render()
    # Create the stop button inside the Blocks context so its click event registers correctly.
    stop_btn = gr.Button("Stop Generation")
    stop_btn.click(fn=stop_stream, inputs=[], outputs=[])

if __name__ == "__main__":
    app.launch(share=True)