Lyte committed on
Commit
fce1fce
·
verified ·
1 Parent(s): 709df81

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import copy
4
+ from llama_cpp import Llama
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ # Initialize Llama model from Hugging Face
8
# Fetch the GGUF weights from the Hugging Face Hub, then load them with
# llama.cpp.  REPO_ID / MODEL_FILE environment variables override the
# default Atlas-Chat-9B quantized build.
_model_path = hf_hub_download(
    repo_id=os.environ.get("REPO_ID", "mradermacher/Atlas-Chat-9B-GGUF"),
    filename=os.environ.get("MODEL_FILE", "Atlas-Chat-9B.Q4_K_M.gguf"),
)
llm = Llama(
    model_path=_model_path,
    n_ctx=4096,       # context window, in tokens
    n_gpu_layers=-1,  # offload all layers to the GPU when one is available
)
16
+
17
# Gemma-style chat template used by Atlas-Chat: one user turn followed by one
# model turn, each closed with an <end_of_turn> marker.  The two {} slots are
# the user message and the assistant reply, in that order.
training_prompt = """<bos><start_of_turn>user
{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn>"""

# End-of-turn marker; also passed to the model as a generation stop string.
EOS_TOKEN = "<end_of_turn>"
24
+
25
# Generate a streamed model reply for the current message + chat history.
def generate_text(
    message,
    history: list[tuple[str, str]],
    max_tokens,
    temperature,
    top_p,
):
    """Stream a reply to *message*, conditioned on the conversation *history*.

    Parameters
    ----------
    message : str
        The user's current message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns, oldest first (Gradio tuple format).
    max_tokens, temperature, top_p
        Sampling controls forwarded to llama.cpp.

    Yields
    ------
    str
        The accumulated response text after each streamed chunk, so the UI
        can progressively update.
    """
    # Replay past turns through the chat template.  <bos> must appear exactly
    # once, at the very start of the prompt, so strip it from every turn after
    # the first (the original code repeated it per turn).
    input_prompt = ""
    for turn_index, (user_input, assistant_response) in enumerate(history):
        turn = training_prompt.format(user_input, assistant_response)
        if turn_index > 0:
            turn = turn.removeprefix("<bos>")
        input_prompt += turn

    # Open the current turn but leave the model slot unterminated.  The
    # original appended the full template with an empty reply, which placed a
    # closing <end_of_turn> right after "<start_of_turn>model" and signalled
    # the turn was already over before the model generated anything.
    current_turn = training_prompt.format(message, "").removesuffix(EOS_TOKEN)
    if history:
        current_turn = current_turn.removeprefix("<bos>")
    input_prompt += current_turn

    # Stream completion chunks from llama.cpp.
    output = llm(
        input_prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=max_tokens,
        stop=[
            EOS_TOKEN,
            "<|endoftext|>"
        ],
        stream=True,
    )

    # Accumulate the streamed text and yield the running total after each
    # chunk.  (Chunks are only read, so no deepcopy is needed.)
    response = ""
    for chunk in output:
        response += chunk["choices"][0]["text"]
        yield response
63
+
64
# Gradio chat UI wired to generate_text.  The title now names the model that
# is actually loaded (Atlas-Chat-9B, see the REPO_ID default above); the
# original title referred to an unrelated Llama-3.1 model.
demo = gr.ChatInterface(
    generate_text,
    title="Atlas-Chat-9B",
    description="Running LLM with https://github.com/abetlen/llama-cpp-python",
    examples=[
        ['How to setup a human base on Mars? Give short answer.'],
        ['Explain theory of relativity to me like I’m 8 years old.'],
        ['شكون لي صنعك؟'],
        ['أشنو كايمييز المملكة المغربية'],
        ['شنو كيتسمى المنتخب المغربي؟']
    ],
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    # Extra sampling controls exposed in the UI; passed positionally to
    # generate_text after (message, history).
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
92
+
93
# Start the web app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()