akmalmzkki committed
Commit ccb7f32 · verified · 1 Parent(s): 7d79d16

Update app.py

Files changed (1): app.py (+36 -20)
app.py CHANGED
@@ -54,39 +54,55 @@
 # demo.launch()
 
 import gradio as gr
-from huggingface_hub import InferenceClient
-import os
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
-client = InferenceClient(
-    model="mistralai/Mistral-Small-24B-Instruct-2501",
-    token=os.getenv('HF_TOKEN')
+# Load model & tokenizer
+model_id = "mistralai/Mistral-Small-24B-Instruct-2501"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Load the model on CPU
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float32,
+    device_map={"": "cpu"}
 )
 
-def simple_chat_fn(message, system_prompt, max_tokens, temperature, top_p):
-    prompt = f"{system_prompt}\n\nUser: {message}\nAssistant:"
-
-    response = client.text_generation(
-        prompt=prompt,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        stream=False
-    )
-    return response
+# Inference function
+def chat_fn(message, system_prompt, max_tokens, temperature, top_p):
+    prompt = f"<s>[INST] {system_prompt.strip()}\n{message.strip()} [/INST]"
+
+    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
 
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id
+        )
+
+    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
+    return decoded.split("[/INST]")[-1].strip()
+
+# Gradio UI
 demo = gr.Interface(
-    fn=simple_chat_fn,
+    fn=chat_fn,
     inputs=[
         gr.Textbox(lines=2, label="User Message"),
         gr.Textbox(value="You are a helpful and concise assistant.", label="System Prompt"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
+        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max Tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
     ],
     outputs="text",
-    title="Mistral 24B Chat (Single Turn)",
-    allow_flagging="never",
+    title="Mistral-Small-24B CPU Chat",
+    description="Chatbot using the Mistral-Small-24B-Instruct-2501 model, run locally on CPU. This will be slow.",
+    flagging_mode="never",
 )
 
 if __name__ == "__main__":
     demo.launch()
+
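
Side note on the new chat_fn: it formats the Mistral instruct prompt by hand as "<s>[INST] ... [/INST]" and then splits the decoded output on "[/INST]". A minimal sketch of an alternative (not part of this commit) uses the tokenizer's built-in chat template from transformers, assuming the checkpoint ships one and that it accepts a "system" role; it reuses the model, tokenizer, and chat_fn parameters defined in app.py above:

# Sketch only, not the committed code: reuses model, tokenizer, and the
# chat_fn parameters (message, system_prompt, max_tokens, temperature, top_p).
messages = [
    # Drop this entry if the checkpoint's template rejects a "system" role.
    {"role": "system", "content": system_prompt.strip()},
    {"role": "user", "content": message.strip()},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)
with torch.no_grad():
    output = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
# Decode only the newly generated tokens instead of splitting on "[/INST]".
reply = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)

This keeps the prompt in exactly the format the tokenizer defines, avoids a possible duplicated BOS token (the tokenizer usually adds its own "<s>" on top of the one written into the f-string), and does not depend on the literal "[/INST]" marker surviving decoding.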