import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
import threading
import time

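# Load the tokenizer and model once at startup; fp16 on GPU halves memory, fp32 on CPU.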
model_id = "lambdaindie/lambdai"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

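# Dark monospace styling, plus a pulsing box for the streamed "thinking" text.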
css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono&display=swap');
* { font-family: 'JetBrains Mono', monospace !important; }
body { background-color: #111; color: #e0e0e0; }
.markdown-think {
    background-color: #1e1e1e;
    border-left: 4px solid #555;
    padding: 10px;
    margin-bottom: 8px;
    font-style: italic;
    white-space: pre-wrap;
    animation: pulse 1.5s infinite ease-in-out;
}
@keyframes pulse {
    0% { opacity: 0.6; }
    50% { opacity: 1.0; }
    100% { opacity: 0.6; }
}
"""

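# Two-pass handler: first stream the model's step-by-step reasoning,
# then stream a final answer conditioned on that reasoning.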
def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}] if system_message else []

    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})

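    # First pass: ask the model to reason step-by-step before answering.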
    thinking_prompt = messages + [{"role": "user", "content": f"{message}\n\nThink step-by-step."}]
    prompt = tokenizer.apply_chat_template(thinking_prompt, tokenize=False, add_generation_prompt=True)

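    # TextIteratorStreamer yields decoded tokens as generate() produces them in the background.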
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    reasoning = ""
    yield '<div class="markdown-think">Thinking...</div>'
    start = time.time()

    # Run generation on a background thread so tokens can be streamed as they arrive.
    thread = threading.Thread(target=model.generate, kwargs={
        "inputs": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": max_tokens,
        "do_sample": True,  # sampling must be enabled for temperature/top_p to take effect
        "temperature": temperature,
        "top_p": top_p,
        "streamer": streamer,
    })
    thread.start()

    for token in streamer:
        reasoning += token
        yield f'<div class="markdown-think">{reasoning.strip()}</div>'

    thread.join()
    elapsed = time.time() - start
    yield f"""<div style="margin-top:12px;padding:8px 12px;background-color:#222;border-left:4px solid #888;
              font-family:'JetBrains Mono', monospace;color:#ccc;font-size:14px;">
              Pensou por {elapsed:.1f} segundos</div>"""

    # Second pass: generate the final answer conditioned on the reasoning above
    final_prompt = messages + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": reasoning.strip()},
        {"role": "user", "content": "Now answer based on that."}
    ]
    prompt2 = tokenizer.apply_chat_template(final_prompt, tokenize=False, add_generation_prompt=True)
    inputs2 = tokenizer(prompt2, return_tensors="pt").to(device)
    streamer2 = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    thread2 = threading.Thread(target=model.generate, kwargs={
        "inputs": inputs2["input_ids"],
        "attention_mask": inputs2["attention_mask"],
        "max_new_tokens": max_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "streamer": streamer2,
    })
    thread2.start()

    final_answer = ""
    for token in streamer2:
        final_answer += token
        yield final_answer.strip()
    thread2.join()

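# The additional inputs are passed positionally as the extra arguments of respond().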
demo = gr.ChatInterface(
    fn=respond,
    title="λambdAI",
    theme=gr.themes.Base(),
    css=css,
    additional_inputs=[
        gr.Textbox(value="", label="System Message"),
        gr.Slider(64, 2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
    ]
)

if __name__ == "__main__":
    demo.launch()