AC-Angelo93 committed on
Commit c256c10 · verified · 1 Parent(s): 458ea2c

Update app.py

Files changed (1)
  1. app.py +81 -40
app.py CHANGED
@@ -1,67 +1,108 @@
 
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch

-tokenizer = AutoTokenizer.from_pretrained("Fastweb/FastwebMIIA-7B")
-model = AutoModelForCausalLM.from_pretrained("Fastweb/FastwebMIIA-7B")


 def respond(
-    message,
     history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
 ):
-    messages = [{"role": "system", "content": system_message}]

-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})

     messages.append({"role": "user", "content": message})

-    # Format messages for the model
-    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
-
-    # Generate response
-    outputs = model.generate(
-        input_ids,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        do_sample=True,
-        pad_token_id=tokenizer.eos_token_id
     )
-
-    # Decode the generated tokens, skipping the input tokens
-    response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
-    yield response


-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
     additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
-            value=0.95,
             step=0.05,
-            label="Top-p (nucleus sampling)",
         ),
     ],
 )

-
 if __name__ == "__main__":
-    demo.launch()
+import os
 import gradio as gr
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# If you have a HF token in the Space secrets, uncomment below:
+# os.environ["HUGGINGFACE_HUB_TOKEN"] = os.getenv("HF_TOKEN", "")
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

+# Load tokenizer + model with trust_remote_code, and let Transformers shard/auto-offload if needed.
+tokenizer = AutoTokenizer.from_pretrained(
+    "Fastweb/FastwebMIIA-7B",
+    use_fast=True,
+    trust_remote_code=True
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    "Fastweb/FastwebMIIA-7B",
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto",  # let HF accelerate/device_map place layers automatically
+    trust_remote_code=True
+)

+model.eval()  # set to eval mode

 def respond(
+    message: str,
     history: list[tuple[str, str]],
+    system_message: str,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
 ):
+    """
+    Build a list of messages in the format the model expects, apply any chat template,
+    tokenize, generate, and decode. Wrap inference in torch.no_grad() to save memory.
+    """
+    # 1) Build the "chat" message list
+    messages = []
+    if system_message:
+        messages.append({"role": "system", "content": system_message})

+    for user_msg, bot_msg in history:
+        if user_msg:
+            messages.append({"role": "user", "content": user_msg})
+        if bot_msg:
+            messages.append({"role": "assistant", "content": bot_msg})

     messages.append({"role": "user", "content": message})

+    # 2) Format via the model's chat template
+    # Note: many community models define `apply_chat_template`.
+    input_text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
+    inputs = tokenizer(input_text, return_tensors="pt")
+    input_ids = inputs.input_ids.to(DEVICE)
+    attention_mask = inputs.attention_mask.to(DEVICE)
+
+    # 3) Inference under no_grad
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+
+    # 4) Skip the prompt tokens and decode only the newly generated tokens
+    generated_tokens = outputs[0][input_ids.shape[1]:]
+    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+    return response


+# Build a Gradio ChatInterface; sliders/textbox for system prompt and sampling params
+chat_interface = gr.ChatInterface(
+    fn=respond,
+    title="FastwebMIIA-7B Chatbot",
+    description="A simple chat demo using Fastweb/FastwebMIIA-7B",
+    # "additional_inputs" become available above the conversation window
     additional_inputs=[
+        gr.Textbox(
+            value="You are a helpful assistant.",
+            label="System message (role: system)"
+        ),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
+            value=0.9,
             step=0.05,
+            label="Top-p (nucleus sampling)"
         ),
     ],
+    # You can tweak CSS or theme here if you like; omitted for brevity.
 )

 if __name__ == "__main__":
+    # On HF Spaces, you often want `share=False` (default). If you need to expose a public URL, set True.
+    chat_interface.launch(server_name="0.0.0.0", server_port=7860)
+
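
Note: with this commit, respond() returns the full reply as a single string instead of yielding it, and it can be exercised directly once app.py has loaded the tokenizer and model. A minimal smoke-test sketch, assuming the environment has enough memory for the 7B weights; the example history and prompts below are hypothetical and not part of the commit:

# Hypothetical smoke test for the updated respond(); run after app.py has loaded tokenizer/model.
example_history = [
    ("Hello, who are you?", "I am an assistant based on FastwebMIIA-7B."),
]
reply = respond(
    message="Summarise our conversation in one sentence.",
    history=example_history,
    system_message="You are a helpful assistant.",
    max_tokens=128,
    temperature=0.7,
    top_p=0.9,
)
print(reply)

Since the model is now loaded with device_map="auto", the Space also needs the accelerate package installed alongside torch, transformers, and gradio.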
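The previous version yielded from respond(), and gr.ChatInterface treats a generator callback as a streaming reply, so this commit trades token-by-token output for a single final answer. If streaming is wanted back, one option is transformers' TextIteratorStreamer. A minimal sketch, assuming the tokenizer, model, and DEVICE globals from app.py are in scope; respond_streaming is a hypothetical name, not part of the commit:

# Hypothetical streaming variant of respond(); assumes app.py's tokenizer/model/DEVICE globals.
from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(input_text, return_tensors="pt").to(DEVICE)

    # skip_prompt=True so only newly generated text is pushed to the UI
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }
    # Run generation in a background thread while the generator below yields partial text
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # gr.ChatInterface renders each yielded prefix as the growing reply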