Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,7 @@
|
|
1 |
-
# Imports
|
2 |
import gradio as gr
|
3 |
import os
|
4 |
import random
|
5 |
-
import time
|
6 |
-
import spaces
|
7 |
-
import torch
|
8 |
import threading
|
9 |
-
import socket
|
10 |
-
import requests
|
11 |
|
12 |
from llama_cpp import Llama
|
13 |
from huggingface_hub import hf_hub_download
|
@@ -15,124 +9,92 @@ from huggingface_hub import hf_hub_download
|
|
15 |
# Variables
|
16 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
17 |
|
18 |
-
REPO = "
|
19 |
-
FILE = "
|
20 |
|
21 |
TIMEOUT = 30
|
22 |
|
23 |
MAX_SEED = 9007199254740991
|
24 |
-
DEFAULT_INPUT = ""
|
25 |
-
DEFAULT_SYSTEM = "Statical is a totally normal human named 'Statical'. Statical responds very briefly, truthfully, and accurately."
|
26 |
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
-
DEFAULT_SEPARATOR = ","
|
31 |
-
DEFAULT_STOP_SEQUENCES = f"{TAG_USER}:,{TAG_ASSISTANT}:,</s>"
|
32 |
-
|
33 |
-
model = Llama(model_path=hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN), n_ctx=32768, n_threads=48, n_batch=512, n_gpu_layers=0, verbose=True)
|
34 |
-
|
35 |
-
# Functions
|
36 |
def get_seed(seed):
|
37 |
-
seed
|
38 |
-
|
39 |
-
return int(seed)
|
40 |
else:
|
41 |
return random.randint(0, MAX_SEED)
|
42 |
|
43 |
-
def generate(
|
44 |
print("[GENERATE] Model is generating...")
|
45 |
-
|
46 |
-
memory = ""
|
47 |
-
for item in history:
|
48 |
-
if item[0]:
|
49 |
-
memory += f"{TAG_USER}: {item[0].strip()}\n"
|
50 |
-
if item[1]:
|
51 |
-
memory += f"{TAG_ASSISTANT}: {item[1].strip()}</s>\n"
|
52 |
-
prompt = f"{system.strip()}\n{memory}{TAG_USER}: {input.strip()}\n{TAG_ASSISTANT}: "
|
53 |
-
|
54 |
-
print(prompt)
|
55 |
-
|
56 |
parameters = {
|
57 |
"prompt": prompt,
|
58 |
"temperature": temperature,
|
59 |
"top_p": top_p,
|
60 |
-
"top_k": top_k,
|
61 |
-
"repeat_penalty":
|
62 |
-
"max_tokens": max_tokens,
|
63 |
-
"stop": [seq.strip() for seq in stop_sequences.split(separator)] if stop_sequences else [],
|
64 |
"seed": get_seed(seed),
|
65 |
-
"stream":
|
66 |
}
|
67 |
-
|
|
|
|
|
68 |
event = threading.Event()
|
|
|
|
|
69 |
|
70 |
try:
|
71 |
output = model.create_completion(**parameters)
|
72 |
print("[GENERATE] Model has generated.")
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
yield buffer
|
83 |
-
timer.cancel()
|
84 |
-
timer = threading.Timer(TIMEOUT, event.set)
|
85 |
-
timer.start()
|
86 |
-
finally:
|
87 |
-
timer.cancel()
|
88 |
-
else:
|
89 |
-
yield output["choices"][0]["text"]
|
90 |
except TimeoutError as e:
|
91 |
yield str(e)
|
92 |
finally:
|
93 |
timer.cancel()
|
94 |
|
95 |
-
@spaces.GPU(duration=15)
|
96 |
-
def gpu():
|
97 |
-
return
|
98 |
-
|
99 |
# Initialize
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
gr.
|
116 |
-
gr.
|
117 |
-
gr.
|
118 |
-
gr.
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
gr.Slider(minimum=1, maximum=2048, step=1, value=50, label="π Top K", render=False),
|
130 |
-
gr.Slider(minimum=0.01, maximum=2, step=0.01, value=1.2, label="π Repetition Penalty", render=False),
|
131 |
-
gr.Slider(minimum=1, maximum=2048, step=1, value=256, label="β³ Max New Tokens", render=False),
|
132 |
-
gr.Textbox(lines=1, value="", label="π± Seed (Blank for random)", render=False),
|
133 |
-
gr.Textbox(lines=1, value=DEFAULT_SEPARATOR, label="π·οΈ Stop Sequences Separator", render=False),
|
134 |
-
gr.Textbox(lines=1, value=DEFAULT_STOP_SEQUENCES, label="π Stop Sequences (Blank for none)", render=False),
|
135 |
-
]
|
136 |
-
)
|
137 |
-
|
138 |
-
main.launch(show_api=False)
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import random
|
|
|
|
|
|
|
4 |
import threading
|
|
|
|
|
5 |
|
6 |
from llama_cpp import Llama
|
7 |
from huggingface_hub import hf_hub_download
|
|
|
# Variables
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face access token; None when the env var is unset

# Hub repository and quantized GGUF weight file to download.
REPO = "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF"
FILE = "smollm2-1.7b-instruct-q4_k_m.gguf"

# Per-generation watchdog timeout in seconds (consumed by generate() below).
TIMEOUT = 30

# 2**53 - 1: largest integer exactly representable as a double; upper bound for random seeds.
MAX_SEED = 9007199254740991

# Load the model once at import time. CPU-only inference (n_gpu_layers=0).
model = Llama(
    model_path=hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN),
    n_ctx=32768,     # context window in tokens
    n_threads=48,    # NOTE(review): assumes a 48-core host — confirm against the Space hardware
    n_batch=512,
    n_gpu_layers=0,  # keep all layers on CPU
    verbose=True
)
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def get_seed(seed):
    """Parse *seed* into an int; fall back to a random seed in [0, MAX_SEED].

    A blank, None, or non-numeric value yields a fresh random seed.
    """
    text = seed.strip() if seed else ""
    if text.isdigit():
        return int(text)
    return random.randint(0, MAX_SEED)
|
33 |
|
34 |
def generate(prompt, temperature, top_p, top_k, repetition_penalty, max_tokens, seed):
    """Stream a completion for *prompt*, yielding the accumulated text so far.

    A watchdog timer fires after TIMEOUT seconds; when it does, the streaming
    loop raises TimeoutError and the error text is yielded so the UI shows it.

    Args mirror the Gradio inputs: sampling parameters plus a seed string
    (blank means random — see get_seed()).
    """
    print("[GENERATE] Model is generating...")

    parameters = {
        "prompt": prompt,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": int(top_k),              # sliders deliver floats; llama.cpp expects ints
        "repeat_penalty": repetition_penalty,
        "max_tokens": int(max_tokens),
        "seed": get_seed(seed),
        "stream": True,
    }

    print("Parameters:", parameters)

    # Watchdog: sets `event` after TIMEOUT seconds so the streaming loop can bail out.
    event = threading.Event()
    timer = threading.Timer(TIMEOUT, event.set)
    timer.start()

    try:
        output = model.create_completion(**parameters)
        print("[GENERATE] Model has generated.")
        buffer = ""
        # Accumulate and re-yield the growing text so the UI updates incrementally.
        # (Original used `for _, item in enumerate(output)` with an unused index.)
        for item in output:
            if event.is_set():
                raise TimeoutError("[ERROR] Generation timed out.")
            buffer += item["choices"][0]["text"]
            yield buffer
    except TimeoutError as e:
        yield str(e)      # surface the timeout message instead of crashing the UI
    finally:
        timer.cancel()    # single cancellation point; Timer.cancel() is idempotent
|
70 |
|
|
|
|
|
|
|
|
|
71 |
# Initialize
# Placeholder links shown in the header markdown below.
# NOTE(review): copy mentions WizardLM-2-8x22B but REPO above points at SmolLM2 — confirm the text is current.
model_base = "Any"
model_quant = "Any Quant"

# NOTE(review): label/markdown strings below appear mojibake-encoded (UTF-8 read as
# Latin-1 emoji) — preserved byte-for-byte here; confirm the intended characters.
with gr.Blocks() as demo:
    gr.Markdown("# ποΈβπ¨οΈ WizardLM")
    gr.Markdown("β’ β‘ A text generation inference for one of the best open-source text models: WizardLM-2-8x22B.")
    gr.Markdown("β’ β οΈ **WARNING!** The inference is very slow due to the model being **HUGE**; it takes about 10 seconds before it starts generating. Please avoid high max token parameters and sending large amounts of text. Note it uses CPU because running it on GPU overloads the model.")
    gr.Markdown(f"β’ π Link to models: [{model_base}]({model_base}) (BASE), [{model_quant}]({model_quant}) (QUANT)")

    # Main prompt input and streaming output text areas.
    prompt = gr.Textbox(lines=4, label="Enter your prompt")
    output = gr.Textbox(lines=10, label="Model output")

    # Sampling parameters, collapsed by default.
    with gr.Accordion("βοΈ Configurations", open=False):
        temperature = gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.01, label="π‘οΈ Temperature")
        top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="π§² Top P")
        top_k = gr.Slider(minimum=1, maximum=2048, value=50, step=1, label="π Top K")
        repetition_penalty = gr.Slider(minimum=0.0, maximum=2.0, value=1.2, step=0.01, label="π Repetition Penalty")
        max_tokens = gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="β³ Max New Tokens")
        # Blank seed means a random one per request (see get_seed()).
        seed = gr.Textbox(lines=1, label="π± Seed (Blank for random)", value="")

    generate_button = gr.Button("Generate")

    # generate() is a generator, so Gradio streams partial output into `output`.
    generate_button.click(
        fn=generate,
        inputs=[prompt, temperature, top_p, top_k, repetition_penalty, max_tokens, seed],
        outputs=output,
    )

demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|