mariusjabami committed
Commit 1906c0e · verified · 1 Parent(s): cc5d25c

Update app.py

Files changed (1)
  1. app.py +46 -48
app.py CHANGED
@@ -1,21 +1,20 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+ import torch
+ import threading
  import time

- client = InferenceClient("lambdaindie/lambdai")
+ model_id = "lambdaindie/lambdai"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)

  css = """
  @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono&display=swap');
-
- * {
-     font-family: 'JetBrains Mono', monospace !important;
- }
-
- body {
-     background-color: #111;
-     color: #e0e0e0;
- }
-
+ * { font-family: 'JetBrains Mono', monospace !important; }
+ body { background-color: #111; color: #e0e0e0; }
  .markdown-think {
      background-color: #1e1e1e;
      border-left: 4px solid #555;
@@ -25,7 +24,6 @@ body {
      white-space: pre-wrap;
      animation: pulse 1.5s infinite ease-in-out;
  }
-
  @keyframes pulse {
      0% { opacity: 0.6; }
      50% { opacity: 1.0; }
@@ -42,54 +40,55 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
          if assistant:
              messages.append({"role": "assistant", "content": assistant})

-     thinking_prompt = messages + [{
-         "role": "user",
-         "content": f"{message}\n\nThink a bit step-by-step before answering."
-     }]
+     thinking_prompt = messages + [{"role": "user", "content": f"{message}\n\nThink step-by-step."}]
+     prompt = tokenizer.apply_chat_template(thinking_prompt, tokenize=False, add_generation_prompt=True)
+
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

      reasoning = ""
      yield '<div class="markdown-think">Thinking...</div>'
-
      start = time.time()

-     for chunk in client.chat_completion(
-         thinking_prompt,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = chunk.choices[0].delta.content or ""
+     thread = threading.Thread(target=model.generate, kwargs={
+         "inputs": inputs["input_ids"],
+         "max_new_tokens": max_tokens,
+         "temperature": temperature,
+         "top_p": top_p,
+         "streamer": streamer,
+     })
+     thread.start()
+
+     for token in streamer:
          reasoning += token
-         styled_thought = f'<div class="markdown-think">{reasoning.strip()}</div>'
-         yield styled_thought
+         yield f'<div class="markdown-think">{reasoning.strip()}</div>'

      elapsed = time.time() - start
+     yield f"""<div style="margin-top:12px;padding:8px 12px;background-color:#222;border-left:4px solid #888;
+     font-family:'JetBrains Mono', monospace;color:#ccc;font-size:14px;">
+     Thought for {elapsed:.1f} seconds</div>"""

-     yield f"""
-     <div style="margin-top:12px;padding:8px 12px;background-color:#222;border-left:4px solid #888;
-     font-family:'JetBrains Mono', monospace;color:#ccc;font-size:14px;">
-     Thought for {elapsed:.1f} seconds
-     </div>
-     """
-
-     time.sleep(2)
-
+     # Second stage: final answer
      final_prompt = messages + [
          {"role": "user", "content": message},
          {"role": "assistant", "content": reasoning.strip()},
-         {"role": "user", "content": "Now answer based on your reasoning above."}
+         {"role": "user", "content": "Now answer based on that."}
      ]
+     prompt2 = tokenizer.apply_chat_template(final_prompt, tokenize=False, add_generation_prompt=True)
+     inputs2 = tokenizer(prompt2, return_tensors="pt").to(device)
+     streamer2 = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+     thread2 = threading.Thread(target=model.generate, kwargs={
+         "inputs": inputs2["input_ids"],
+         "max_new_tokens": max_tokens,
+         "temperature": temperature,
+         "top_p": top_p,
+         "streamer": streamer2,
+     })
+     thread2.start()

      final_answer = ""
-     for chunk in client.chat_completion(
-         final_prompt,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = chunk.choices[0].delta.content or ""
+     for token in streamer2:
          final_answer += token
          yield final_answer.strip()

@@ -99,8 +98,7 @@ demo = gr.ChatInterface(
      theme=gr.themes.Base(),
      css=css,
      additional_inputs=[
-         gr.Textbox(value="",
-                    label="System Message"),
+         gr.Textbox(value="", label="System Message"),
          gr.Slider(64, 2048, value=512, step=1, label="Max Tokens"),
          gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
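For context on the idiom this commit switches to: model.generate is a blocking call, so the new code runs it on a threading.Thread and reads decoded text from a TextIteratorStreamer on the main thread, yielding partial output to Gradio as it arrives. A minimal standalone sketch of that pattern follows; the prompt text and generation settings here are illustrative, not from the commit, and it assumes the lambdaindie/lambdai checkpoint loads as a causal LM with a chat template.

# Minimal sketch of the threaded streaming idiom used in the new app.py.
# Any chat-templated causal LM id could be substituted to try it cheaply.
import threading
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

model_id = "lambdaindie/lambdai"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# apply_chat_template flattens the message list into one prompt string,
# appending the assistant header so generation continues as the assistant.
messages = [{"role": "user", "content": "What is 17 * 23? Think step-by-step."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")

# generate() blocks until finished, so it runs on a worker thread while
# the main thread drains decoded text chunks from the streamer.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
thread = threading.Thread(target=model.generate, kwargs={
    "inputs": inputs["input_ids"],
    "max_new_tokens": 128,      # illustrative value
    "do_sample": True,          # without this, temperature/top_p are ignored
    "temperature": 0.7,
    "top_p": 0.95,
    "streamer": streamer,
})
thread.start()

for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()

One caveat worth noting: generate only honors temperature and top_p when sampling is enabled, and the committed version passes them without do_sample=True. Unless the checkpoint's own generation config enables sampling, current transformers versions will warn and decode greedily, so the two sliders would have no effect.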